diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 637d11d5e..fd241bdcc 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -1,30 +1,24 @@ -#!/bin/sh +#!/bin/bash # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. -base=/opt/rocm/hcc/bin/clang-format -format="" +format=/opt/rocm/hcc/bin/clang-format -# Redirect output to stderr. -exec 1>&2 +# Redirect stdout to stderr. +exec >&2 - # check if clang-format is installed -type "$base" >/dev/null 2>&1 && format="$base" - -# no versions of clang-format are installed -if [ -z "$format" ] -then - echo "$base is not installed. Pre-commit hook will not be executed." +# check if clang-format is installed +if [[ ! -x $format ]]; then + echo "$format is not installed. Pre-commit hook will not be executed." exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) -if git rev-parse --verify HEAD >/dev/null 2>&1 -then +if git rev-parse --verify HEAD >/dev/null 2>&1; then against=HEAD else # Initial commit: diff against an empty tree object @@ -39,10 +33,10 @@ for file in $(git diff-index --cached --name-only $against); do done # do the formatting -for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') -do - if [ -e "$file" ] - then +for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$'); do + if [[ -e "$file" ]]; then + sed -i -e 's/[[:space:]]*$//' "$file" # Remove whitespace at end of lines + sed -i -e '$a\' "$file" # Add missing newline to end of file echo "$format $file" "$format" -i -style=file "$file" fi diff --git a/clients/include/rocblas.hpp b/clients/include/rocblas.hpp index 059060e42..ebbe90d04 100644 --- a/clients/include/rocblas.hpp +++ b/clients/include/rocblas.hpp @@ -900,13 +900,13 @@ rocblas_status (*rocblas_ger_strided_batched)(rocblas_handle handle, const T* alpha, const T* x, rocblas_int incx, - rocblas_int stride_x, + rocblas_stride stride_x, const T* y, rocblas_int incy, - rocblas_int stride_y, + rocblas_stride stride_y, T* A, rocblas_int lda, - rocblas_int stride_a, + rocblas_stride stride_a, rocblas_int batch_count); template <> @@ -1006,14 +1006,14 @@ rocblas_status (*rocblas_gemv_strided_batched)(rocblas_handle handle, const T* alpha, const T* A, rocblas_int lda, - rocblas_int stride_a, + rocblas_stride stride_a, const T* x, rocblas_int incx, - rocblas_int stride_x, + rocblas_stride stride_x, const T* beta, T* y, rocblas_int incy, - rocblas_int stride_y, + rocblas_stride stride_y, rocblas_int batch_count); template <> diff --git a/library/include/rocblas-functions.h b/library/include/rocblas-functions.h index 3ac8da557..6bb25f6e8 100644 --- a/library/include/rocblas-functions.h +++ b/library/include/rocblas-functions.h @@ -2544,14 +2544,14 @@ ROCBLAS_EXPORT rocblas_status rocblas_sgemv_strided_batched(rocblas_handle ha const float* alpha, const float* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const float* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const float* beta, float* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count); ROCBLAS_EXPORT rocblas_status rocblas_dgemv_strided_batched(rocblas_handle handle, @@ -2561,14 +2561,14 @@ ROCBLAS_EXPORT rocblas_status rocblas_dgemv_strided_batched(rocblas_handle ha const double* alpha, const double* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const double* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const double* beta, double* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count); ROCBLAS_EXPORT rocblas_status rocblas_cgemv_strided_batched(rocblas_handle handle, @@ -2578,14 +2578,14 @@ ROCBLAS_EXPORT rocblas_status rocblas_cgemv_strided_batched(rocblas_handle const rocblas_float_complex* alpha, const rocblas_float_complex* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const rocblas_float_complex* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const rocblas_float_complex* beta, rocblas_float_complex* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count); ROCBLAS_EXPORT rocblas_status rocblas_zgemv_strided_batched(rocblas_handle handle, @@ -2595,14 +2595,14 @@ ROCBLAS_EXPORT rocblas_status rocblas_zgemv_strided_batched(rocblas_handle const rocblas_double_complex* alpha, const rocblas_double_complex* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const rocblas_double_complex* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const rocblas_double_complex* beta, rocblas_double_complex* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count); /*! \brief BLAS Level 2 API @@ -2951,13 +2951,13 @@ ROCBLAS_EXPORT rocblas_status rocblas_sger_strided_batched(rocblas_handle handle const float* alpha, const float* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const float* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, float* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, rocblas_int batch_count); ROCBLAS_EXPORT rocblas_status rocblas_dger_strided_batched(rocblas_handle handle, @@ -2966,13 +2966,13 @@ ROCBLAS_EXPORT rocblas_status rocblas_dger_strided_batched(rocblas_handle handle const double* alpha, const double* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const double* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, double* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, rocblas_int batch_count); /* not implemented diff --git a/library/src/blas2/rocblas_gemv_strided_batched.cpp b/library/src/blas2/rocblas_gemv_strided_batched.cpp index 21212dfc4..d67b38a69 100644 --- a/library/src/blas2/rocblas_gemv_strided_batched.cpp +++ b/library/src/blas2/rocblas_gemv_strided_batched.cpp @@ -28,14 +28,14 @@ namespace const T* alpha, const T* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const T* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const T* beta, T* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count) { if(!handle) @@ -188,14 +188,14 @@ rocblas_status rocblas_sgemv_strided_batched(rocblas_handle handle, const float* alpha, const float* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const float* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const float* beta, float* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count) { return rocblas_gemv_strided_batched_impl(handle, @@ -223,14 +223,14 @@ rocblas_status rocblas_dgemv_strided_batched(rocblas_handle handle, const double* alpha, const double* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const double* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const double* beta, double* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count) { return rocblas_gemv_strided_batched_impl(handle, @@ -258,14 +258,14 @@ rocblas_status rocblas_cgemv_strided_batched(rocblas_handle handle const rocblas_float_complex* alpha, const rocblas_float_complex* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const rocblas_float_complex* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const rocblas_float_complex* beta, rocblas_float_complex* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count) { return rocblas_gemv_strided_batched_impl(handle, @@ -293,14 +293,14 @@ rocblas_status rocblas_zgemv_strided_batched(rocblas_handle handl const rocblas_double_complex* alpha, const rocblas_double_complex* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, const rocblas_double_complex* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const rocblas_double_complex* beta, rocblas_double_complex* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, rocblas_int batch_count) { return rocblas_gemv_strided_batched_impl(handle, diff --git a/library/src/blas2/rocblas_ger_strided_batched.cpp b/library/src/blas2/rocblas_ger_strided_batched.cpp index b7de68dbc..f5069e47a 100644 --- a/library/src/blas2/rocblas_ger_strided_batched.cpp +++ b/library/src/blas2/rocblas_ger_strided_batched.cpp @@ -23,13 +23,13 @@ namespace const T* alpha, const T* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const T* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, T* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, rocblas_int batch_count) { if(!handle) @@ -170,13 +170,13 @@ rocblas_status rocblas_sger_strided_batched(rocblas_handle handle, const float* alpha, const float* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const float* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, float* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, rocblas_int batch_count) { return rocblas_ger_strided_batched_impl( @@ -189,13 +189,13 @@ rocblas_status rocblas_dger_strided_batched(rocblas_handle handle, const double* alpha, const double* x, rocblas_int incx, - rocblas_int stridex, + rocblas_stride stridex, const double* y, rocblas_int incy, - rocblas_int stridey, + rocblas_stride stridey, double* A, rocblas_int lda, - rocblas_int strideA, + rocblas_stride strideA, rocblas_int batch_count) { return rocblas_ger_strided_batched_impl( diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml index 4eee15c76..6eec64103 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -67363,6 +67363,718 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_NLCA1_NLCB1_PGR0_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [4, 1028.02] @@ -72598,4 +73310,12 @@ - [400, 8824.43] - - [784, 128, 64, 512] - [402, 9393.09] + - - [65, 1024, 1, 6400] + - [413, 3556.98] + - - [256, 4096, 1, 6400] + - [414, 10132.4] + - - [1024, 4096, 1, 64] + - [415, 6918.44] + - - [1024, 4096, 1, 6336] + - [416, 10393.9] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml index 56991285b..6b2c8d8df 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -116061,6 +116061,543 @@ WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -122532,4 +123069,10 @@ - [714, 7844.54] - - [1225, 64, 64, 256] - [721, 8721.52] + - - [65, 6400, 1, 1024] + - [722, 2839.89] + - - [256, 6400, 1, 4096] + - [723, 7361.66] + - - [1024, 64, 1, 4096] + - [724, 3787.18] - null diff --git a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml index 853958bb8..d46d5efc5 100644 --- a/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/archive/vega20_Cijk_Alik_Bljk_SB.yaml @@ -96477,6 +96477,539 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL1_GRVW2_GSU1_PGR0_PLR1_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -102748,4 +103281,10 @@ - [566, 7023.59] - - [192, 128, 9, 9792] - [557, 5400.54] + - - [1024, 6400, 1, 65] + - [590, 5298.31] + - - [4096, 6400, 1, 256] + - [591, 9150.88] + - - [4096, 64, 1, 1024] + - [592, 5482.75] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml index e04f05ccb..5c1b95cb4 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -67363,6 +67363,867 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_NLCA1_NLCB1_PGR0_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -67383,7 +68244,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67396,21 +68257,170 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 8 + LSCB: 16 LSPA: 4 LSPB: 8 LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 896 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -67429,9 +68439,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67439,8 +68449,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -67491,12 +68501,310 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 413 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -67509,7 +68817,156 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -67640,7 +69097,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 414 + SolutionIndex: 423 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67659,7 +69116,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -67789,7 +69246,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 415 + SolutionIndex: 424 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67808,7 +69265,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -67938,7 +69395,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 416 + SolutionIndex: 425 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67957,7 +69414,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68087,7 +69544,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 417 + SolutionIndex: 426 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -68106,7 +69563,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68236,7 +69693,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 418 + SolutionIndex: 427 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -68255,7 +69712,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68268,7 +69725,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68277,7 +69734,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -68290,25 +69747,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68321,11 +69778,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68335,13 +69792,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68385,12 +69842,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 419 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -68403,8 +69860,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68417,7 +69874,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68442,18 +69899,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -68470,10 +69927,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -68482,14 +69939,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -68534,17 +69991,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 420 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -68553,7 +70010,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68566,7 +70023,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68574,39 +70031,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 4 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68619,11 +70076,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68631,14 +70088,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -68683,26 +70140,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 421 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68715,7 +70172,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68723,39 +70180,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68768,11 +70225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68780,15 +70237,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68832,26 +70289,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 422 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68864,7 +70321,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68873,7 +70330,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -68886,25 +70343,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68917,7 +70374,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -68931,11 +70388,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -68981,8 +70438,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 @@ -69000,7 +70457,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69021,35 +70478,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -69068,9 +70525,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69078,13 +70535,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -69130,25 +70587,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 4, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -69171,7 +70628,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69184,25 +70641,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69216,10 +70673,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69229,13 +70686,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69279,12 +70736,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -69297,8 +70754,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69337,17 +70794,17 @@ InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1664 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -69366,9 +70823,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69376,14 +70833,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -69428,17 +70885,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -69446,8 +70903,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69468,39 +70925,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 8 LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69514,10 +70971,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69525,8 +70982,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -69577,26 +71034,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69609,7 +71066,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -69634,22 +71091,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69662,10 +71119,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 24 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69674,15 +71131,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69726,17 +71183,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -69744,78 +71201,78 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69823,13 +71280,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -69875,46 +71332,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -69924,47 +71381,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69974,11 +71431,11 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70024,48 +71481,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70073,47 +71530,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70121,15 +71578,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70173,12 +71630,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -70186,35 +71643,35 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70222,43 +71679,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -70270,15 +71727,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70322,8 +71779,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -70335,33 +71792,33 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -70371,43 +71828,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 8 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -70419,15 +71876,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70471,8 +71928,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -70484,13 +71941,13 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -70503,7 +71960,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70511,8 +71968,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70524,26 +71981,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70556,10 +72013,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70568,13 +72025,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70620,20 +72077,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -70652,7 +72109,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70660,39 +72117,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70705,11 +72162,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70717,7 +72174,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 @@ -70769,24 +72226,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -70801,7 +72258,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70809,39 +72266,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70854,11 +72311,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70866,13 +72323,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70918,24 +72375,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -70950,7 +72407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70967,36 +72424,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -71015,7 +72472,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -71023,7 +72480,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71067,8 +72524,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71085,7 +72542,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71108,7 +72565,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71121,41 +72578,41 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -71172,7 +72629,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71216,17 +72673,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -71234,7 +72691,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71257,7 +72714,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71270,17 +72727,17 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1792 LdsNumElementsAlignedA: 512 @@ -71313,15 +72770,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71365,17 +72822,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -71383,7 +72840,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71405,39 +72862,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -71451,10 +72908,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71462,15 +72919,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71514,25 +72971,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71555,7 +73012,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71568,21 +73025,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -71592,18 +73049,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71611,15 +73068,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71663,17 +73120,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -71681,8 +73138,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -71703,8 +73160,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71715,19 +73172,19 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 1024 LdsNumElementsAlignedA: 256 @@ -71741,14 +73198,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -71760,7 +73217,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -71768,7 +73225,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71812,8 +73269,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71825,13 +73282,13 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -71853,7 +73310,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71866,19 +73323,19 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 @@ -71890,14 +73347,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -71909,15 +73366,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71961,8 +73418,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71979,8 +73436,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72001,35 +73458,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -72048,9 +73505,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72058,15 +73515,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72110,26 +73567,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72150,7 +73607,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -72162,43 +73619,43 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72207,13 +73664,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -72259,26 +73716,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72299,7 +73756,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -72307,31 +73764,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72345,9 +73802,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72356,7 +73813,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -72408,25 +73865,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -72449,7 +73906,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72462,25 +73919,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72494,9 +73951,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72513,7 +73970,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72557,11 +74014,11 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] @@ -72575,7 +74032,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -72597,8 +74054,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72606,46 +74063,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72654,15 +74111,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72706,26 +74163,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72755,7 +74212,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -72855,8 +74312,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -72904,7 +74361,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -73004,8 +74461,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -73044,7 +74501,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -73053,30 +74510,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73090,9 +74547,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73101,7 +74558,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -73153,26 +74610,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73185,7 +74642,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73205,33 +74662,33 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73240,9 +74697,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73250,13 +74707,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -73302,12 +74759,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -73320,8 +74777,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73334,7 +74791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73342,8 +74799,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -73351,46 +74808,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73399,15 +74856,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73451,25 +74908,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -73483,7 +74940,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73491,31 +74948,31 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -73529,7 +74986,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73537,10 +74994,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73548,15 +75005,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73600,26 +75057,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73632,7 +75089,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73640,8 +75097,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -73649,30 +75106,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73685,10 +75142,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73697,13 +75154,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -73749,26 +75206,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73781,7 +75238,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73789,39 +75246,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73834,11 +75291,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73846,7 +75303,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -73898,24 +75355,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -73938,39 +75395,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73984,10 +75441,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73995,8 +75452,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -74047,25 +75504,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -74087,39 +75544,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74133,10 +75590,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74144,15 +75601,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74196,25 +75653,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -74236,39 +75693,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 128 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74282,10 +75739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74293,13 +75750,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -74345,26 +75802,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74404,12 +75861,12 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 32 + LSPA: 8 + LSPB: 16 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -74423,14 +75880,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -74442,15 +75899,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74494,8 +75951,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -74512,8 +75969,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74553,12 +76010,12 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -74572,14 +76029,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -74591,15 +76048,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74643,8 +76100,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -74661,8 +76118,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74683,39 +76140,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74729,10 +76186,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74740,13 +76197,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -74792,26 +76249,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74832,56 +76289,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74889,8 +76346,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -74941,26 +76398,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74998,22 +76455,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75027,9 +76484,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75038,13 +76495,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -75090,25 +76547,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -75130,56 +76587,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75187,15 +76644,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75239,25 +76696,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -75279,56 +76736,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75336,15 +76793,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75388,26 +76845,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75437,7 +76894,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -75537,8 +76994,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75556,7 +77013,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75586,7 +77043,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -75686,8 +77143,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75705,7 +77162,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75735,7 +77192,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -75835,8 +77292,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 469 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75854,7 +77311,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75884,7 +77341,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -75892,22 +77349,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75921,9 +77378,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75932,13 +77389,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -75984,17 +77441,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 470 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -76033,30 +77490,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76070,9 +77527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76081,13 +77538,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -76133,17 +77590,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 471 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 SubGroupA: 16 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -76182,7 +77639,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -76282,8 +77739,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 472 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -76306,51 +77763,51 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -76360,7 +77817,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76368,10 +77825,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76379,13 +77836,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -76431,46 +77888,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 473 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76480,22 +77937,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -76509,18 +77966,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76528,8 +77985,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -76580,12 +78037,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 474 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -76593,33 +78050,33 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76629,47 +78086,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76677,14 +78134,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -76729,96 +78186,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 475 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76826,8 +78283,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -76878,96 +78335,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 476 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76975,8 +78432,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -77027,26 +78484,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 477 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -77059,7 +78516,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77084,22 +78541,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77112,10 +78569,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77124,14 +78581,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -77176,17 +78633,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 478 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 @@ -77195,7 +78652,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -77233,22 +78690,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77262,9 +78719,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77273,13 +78730,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -77325,17 +78782,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 479 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 @@ -77344,27 +78801,27 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77381,23 +78838,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77410,10 +78867,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77422,14 +78879,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -77474,20 +78931,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 480 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppresssNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -77498,22 +78955,22 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77530,23 +78987,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77559,11 +79016,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77571,13 +79028,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -77623,35 +79080,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 481 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 4] + SuppresssNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77661,8 +79118,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77679,7 +79136,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 64 LSPA: 16 @@ -77772,35 +79229,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 482 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppresssNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77810,8 +79267,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77828,7 +79285,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 128 LSPA: 16 @@ -77921,35 +79378,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 483 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppresssNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77959,41 +79416,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78007,10 +79464,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78018,14 +79475,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -78070,26 +79527,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 484 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppresssNoLoadLoop: true + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78102,7 +79559,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78128,21 +79585,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78155,11 +79612,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78167,13 +79624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -78219,18 +79676,18 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 485 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -78251,7 +79708,156 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: [8, 6] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78285,13 +79891,13 @@ LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78304,7 +79910,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -78318,12 +79924,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -78368,8 +79974,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 486 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -78387,7 +79993,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78517,7 +80123,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 487 + SolutionIndex: 497 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78536,7 +80142,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78666,7 +80272,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 488 + SolutionIndex: 498 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78685,7 +80291,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78815,7 +80421,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 489 + SolutionIndex: 499 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78834,7 +80440,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78964,7 +80570,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 490 + SolutionIndex: 500 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78983,7 +80589,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79113,7 +80719,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 491 + SolutionIndex: 501 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -79132,7 +80738,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79262,7 +80868,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 492 + SolutionIndex: 502 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -79281,60 +80887,60 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79347,11 +80953,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79359,15 +80965,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79411,48 +81017,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 493 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79464,26 +81070,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79496,11 +81102,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79508,15 +81114,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79560,26 +81166,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 494 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [4, 8] + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79592,7 +81198,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79600,8 +81206,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79612,44 +81218,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79657,15 +81263,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79709,26 +81315,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 495 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: true - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79749,56 +81355,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79806,13 +81412,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -79858,26 +81464,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 496 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79890,7 +81496,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79898,8 +81504,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79907,47 +81513,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79955,15 +81561,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80007,26 +81613,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 497 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [8, 6] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -80039,7 +81645,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80047,39 +81653,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -80092,11 +81698,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80104,15 +81710,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80156,46 +81762,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 498 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -80205,22 +81811,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 409 LdsNumElementsAlignedA: 64 @@ -80234,18 +81840,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80253,8 +81859,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -80305,12 +81911,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 499 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -80318,33 +81924,33 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] + WorkGroup: [4, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -80353,23 +81959,23 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 409 LdsNumElementsAlignedA: 64 @@ -80390,11 +81996,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80402,8 +82008,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -80454,24 +82060,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 500 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -80503,7 +82109,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -80603,13 +82209,13 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 501 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 SubGroupA: 4 SubGroupB: 4 - SuppresssNoLoadLoop: true + SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -80635,7 +82241,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80652,36 +82258,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80700,7 +82306,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -80708,7 +82314,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80752,13 +82358,13 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 502 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: true + SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -80770,7 +82376,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -80801,7 +82407,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -80901,8 +82507,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 503 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 SubGroupA: 4 @@ -80933,7 +82539,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80950,36 +82556,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80998,15 +82604,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81050,8 +82656,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 504 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -81068,13 +82674,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81088,8 +82693,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81098,48 +82702,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81147,15 +82751,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81199,31 +82801,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 505 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81231,14 +82833,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81247,31 +82848,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81284,11 +82885,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81296,15 +82897,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81348,31 +82947,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 506 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81381,13 +82980,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81396,48 +82994,186 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 LVPA: 8 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81445,20 +83181,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81497,31 +83231,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 507 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81530,13 +83264,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81545,48 +83278,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPB: 1 + LVCA: 32 + LVCB: 256 LVPA: 8 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81594,20 +83323,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81646,31 +83373,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 508 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81678,14 +83405,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81694,48 +83420,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81743,20 +83465,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81795,31 +83515,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 509 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81833,8 +83553,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81843,48 +83562,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81892,14 +83611,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -81944,27 +83661,28 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 510 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81982,39 +83700,39 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82029,9 +83747,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82040,12 +83758,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82089,24 +83807,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 511 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 @@ -82129,38 +83847,38 @@ EdgeType: ShiftPtr FractionalLoad: 1 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82174,10 +83892,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82185,13 +83903,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82235,24 +83953,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 512 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: [4, 8] + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 @@ -82267,42 +83985,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82312,14 +84030,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82332,7 +84050,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -82377,19 +84095,19 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 513 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -82409,42 +84127,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82454,14 +84176,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82469,18 +84191,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82519,25 +84241,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 514 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [4, 8] + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82551,42 +84273,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82596,14 +84318,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82616,7 +84338,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -82661,25 +84383,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 515 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82700,35 +84422,39 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82753,18 +84479,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82803,25 +84529,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 516 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: [4, 8] + SubGroupB: 16 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82842,39 +84568,35 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82889,9 +84611,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82899,18 +84621,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82949,33 +84671,34 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 517 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82987,6 +84710,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83006,21 +84730,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 8 + LVPA: 1 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83035,9 +84755,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83049,14 +84769,19 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83072,6 +84797,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83081,6 +84807,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83095,33 +84822,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 518 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83133,8 +84870,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -83147,26 +84885,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83180,10 +84918,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83191,13 +84929,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83218,6 +84961,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83227,6 +84971,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83241,44 +84986,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 519 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83298,17 +85054,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 8 + LVPA: 1 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83321,11 +85077,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83337,9 +85093,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83360,6 +85121,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83369,6 +85131,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83383,12 +85146,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 520 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -83400,27 +85171,30 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83441,20 +85215,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 4 + LSPB: 4 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83467,7 +85241,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -83479,13 +85253,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83506,6 +85285,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83515,6 +85295,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83529,33 +85310,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 521 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83567,6 +85358,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83585,18 +85377,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83610,10 +85402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83625,9 +85417,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83648,6 +85445,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83657,6 +85455,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83671,33 +85470,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 522 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83709,6 +85518,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83774,6 +85584,11 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83794,6 +85609,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83803,6 +85619,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83817,12 +85634,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 523 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -83835,150 +85660,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3072 - LdsOffsetA: 0 - LdsOffsetB: 1024 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 524 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83992,7 +85676,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84017,18 +85701,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 4 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 8 LVPA: 1 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84041,10 +85725,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84057,9 +85741,9 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84114,14 +85798,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -84135,10 +85819,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84152,13 +85836,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84179,20 +85863,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84205,7 +85885,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -84217,13 +85897,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84233,8 +85913,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84278,31 +85958,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84316,7 +85996,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84342,17 +86022,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84365,11 +86045,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84377,13 +86057,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84438,20 +86118,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -84459,10 +86139,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84476,7 +86156,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84503,20 +86183,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84529,7 +86209,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -84541,13 +86221,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84602,31 +86282,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84762,8 +86442,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84784,7 +86464,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84806,7 +86486,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84834,13 +86514,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84881,8 +86557,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84926,8 +86602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84935,7 +86611,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -84948,7 +86624,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84989,18 +86665,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85014,10 +86690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85025,13 +86701,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85086,20 +86762,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -85107,8 +86783,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85130,7 +86806,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -85158,9 +86834,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85201,8 +86881,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -85246,8 +86926,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85255,7 +86935,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85268,7 +86948,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85284,43 +86964,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85333,7 +87017,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -85346,11 +87030,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85361,13 +87047,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85406,8 +87093,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85415,24 +87102,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85444,47 +87129,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85497,7 +87178,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -85510,11 +87191,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85525,13 +87208,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85570,8 +87254,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85579,24 +87263,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85608,43 +87290,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85657,10 +87339,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85669,13 +87351,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85692,6 +87376,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85730,33 +87415,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85768,43 +87451,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85817,11 +87504,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85829,12 +87516,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85845,13 +87534,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85890,8 +87580,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85899,24 +87589,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85928,43 +87616,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85977,11 +87665,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85990,12 +87678,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86006,12 +87696,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86050,15 +87741,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -86066,17 +87757,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86088,47 +87777,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86141,11 +87826,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86154,12 +87839,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86169,13 +87856,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86214,33 +87902,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86258,7 +87944,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -86278,21 +87964,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86307,9 +87989,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86317,15 +87999,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86335,7 +88017,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -86381,28 +88063,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -86443,15 +88125,15 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 640 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -86468,9 +88150,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86478,15 +88160,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86542,28 +88224,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -86584,7 +88266,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -86604,17 +88286,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86629,9 +88315,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86639,14 +88325,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86657,8 +88343,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86703,8 +88389,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86712,12 +88398,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -86725,7 +88411,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -86868,8 +88554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86890,7 +88576,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87029,169 +88715,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 768 - LdsOffsetA: 0 - LdsOffsetB: 512 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87212,7 +88737,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87252,15 +88777,15 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 768 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -87277,9 +88802,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87287,14 +88812,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -87306,7 +88831,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -87351,8 +88876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87361,11 +88886,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87373,7 +88898,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87512,8 +89037,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87534,7 +89059,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87548,13 +89073,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -87583,12 +89108,8 @@ LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87601,7 +89122,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -87615,12 +89136,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87631,7 +89152,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -87677,8 +89198,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87686,7 +89207,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -87699,174 +89220,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87878,7 +89234,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87904,17 +89260,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 2 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87927,11 +89283,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87939,13 +89295,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88003,8 +89359,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88013,11 +89369,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -88025,9 +89381,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88039,7 +89395,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88065,17 +89421,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 2 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88088,11 +89444,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88100,13 +89456,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88164,8 +89520,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88174,11 +89530,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -88186,9 +89542,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88200,7 +89556,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88226,17 +89582,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88249,11 +89605,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88263,13 +89619,13 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88325,15 +89681,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -88346,10 +89702,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88361,7 +89717,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88381,23 +89737,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 32 + LSPA: 2 LSPB: 4 LVCA: 64 - LVCB: 64 - LVPA: 4 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88410,11 +89766,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88423,14 +89779,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88486,15 +89842,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -88506,11 +89862,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88522,7 +89878,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88542,23 +89898,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88571,11 +89927,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88583,13 +89939,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88602,7 +89958,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -88647,8 +90003,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88657,21 +90013,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88683,7 +90039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88703,23 +90059,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88732,11 +90088,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88744,13 +90100,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88763,7 +90119,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -88808,8 +90164,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88818,21 +90174,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88844,7 +90200,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88864,23 +90220,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 8 + LSPA: 2 + LSPB: 4 LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88893,7 +90249,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -88905,15 +90261,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88969,31 +90325,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89011,37 +90367,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89056,9 +90416,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89066,14 +90426,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -89084,8 +90444,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -89130,28 +90490,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -89166,43 +90526,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89215,11 +90579,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89229,13 +90593,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89245,7 +90609,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89291,31 +90655,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89327,43 +90691,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89376,11 +90744,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89390,13 +90758,11 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89406,8 +90772,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -89452,31 +90818,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89488,43 +90856,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89537,11 +90909,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89549,15 +90921,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89567,7 +90939,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89613,31 +90985,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89649,7 +91021,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89674,22 +91046,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89702,10 +91074,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89716,13 +91088,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89778,14 +91150,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 4] @@ -89799,10 +91171,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89821,7 +91193,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -89839,22 +91211,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89868,9 +91240,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89879,13 +91251,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -89943,8 +91313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89953,10 +91323,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -89968,6 +91338,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89986,7 +91358,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90004,22 +91376,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90033,10 +91405,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90044,12 +91416,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90106,8 +91480,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90116,11 +91490,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90131,8 +91505,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90151,7 +91523,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90169,22 +91541,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90198,9 +91570,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90209,13 +91581,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90273,8 +91643,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90283,10 +91653,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -90298,6 +91668,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90334,22 +91706,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90363,9 +91735,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90374,13 +91746,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90438,8 +91810,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90448,10 +91820,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -90460,7 +91832,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -90481,7 +91853,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90499,22 +91871,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90528,10 +91900,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90541,10 +91913,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90601,8 +91975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90611,11 +91985,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90623,11 +91997,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90646,7 +92018,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90664,22 +92036,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90693,9 +92065,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90704,13 +92076,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90768,8 +92138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90778,10 +92148,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -90790,9 +92160,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90811,7 +92183,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90830,17 +92202,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -90859,9 +92231,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90869,12 +92241,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90931,8 +92305,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90941,11 +92315,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90953,11 +92327,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90976,7 +92348,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90994,22 +92366,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91023,9 +92395,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91034,13 +92406,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -91098,8 +92468,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91108,10 +92478,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -91123,6 +92493,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91159,22 +92531,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91188,9 +92560,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91199,13 +92571,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91263,8 +92635,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91273,10 +92645,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -91324,22 +92696,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91353,9 +92725,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91364,11 +92736,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91426,8 +92798,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91436,10 +92808,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -91456,7 +92828,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91471,7 +92843,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91488,23 +92860,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91518,9 +92891,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91529,13 +92902,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -91547,8 +92918,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91593,8 +92965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91602,11 +92974,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -91618,10 +92990,12 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91629,47 +93003,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91682,11 +93057,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91695,11 +93070,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -91710,6 +93087,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91756,8 +93134,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91765,28 +93143,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91794,7 +93170,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91802,39 +93178,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91847,11 +93224,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91859,13 +93236,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91877,8 +93254,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91923,8 +93301,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91932,14 +93310,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -91947,11 +93325,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91959,7 +93337,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91967,39 +93345,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92012,11 +93391,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92024,12 +93403,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92040,6 +93419,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92086,8 +93466,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92095,22 +93475,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92124,48 +93504,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92178,7 +93554,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92191,12 +93567,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92207,8 +93586,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -92253,8 +93632,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92269,17 +93648,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92297,42 +93674,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 32 + LVCA: 64 LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92347,9 +93720,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92357,15 +93730,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92376,7 +93750,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92422,8 +93796,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92432,19 +93806,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -92484,22 +93858,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 + LSPA: 4 LSPB: 4 - LVCA: 128 + LVCA: 64 LVCB: 64 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92513,9 +93887,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92524,15 +93898,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92544,7 +93919,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -92589,8 +93964,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92599,10 +93974,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -92611,7 +93986,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -92625,44 +94000,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -92679,10 +94054,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92691,13 +94066,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92754,8 +94132,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92764,23 +94142,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92792,44 +94168,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92842,7 +94222,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92855,13 +94235,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -92874,7 +94254,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92920,8 +94300,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92936,15 +94316,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92956,13 +94336,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -92976,30 +94356,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93007,10 +94391,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93018,14 +94402,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93038,8 +94422,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93084,31 +94468,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93120,13 +94504,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -93140,34 +94524,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93175,10 +94555,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93186,14 +94566,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93206,7 +94586,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93252,31 +94632,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93295,58 +94675,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93354,14 +94734,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93375,7 +94753,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93420,31 +94798,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93456,65 +94836,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 8 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93522,13 +94902,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93588,31 +94966,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93624,54 +95004,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93679,10 +95059,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93690,13 +95070,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93711,7 +95089,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -93756,31 +95134,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93792,50 +95172,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93843,10 +95227,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93854,13 +95238,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93874,7 +95256,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93920,31 +95302,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93956,7 +95340,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93976,45 +95360,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94022,12 +95406,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94041,7 +95425,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -94086,33 +95470,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94124,54 +95508,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94180,9 +95564,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94190,12 +95574,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94209,7 +95595,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94254,33 +95640,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94299,7 +95683,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -94318,22 +95702,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94347,10 +95731,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94358,12 +95742,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94422,8 +95808,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94432,11 +95818,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -94447,8 +95833,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94486,22 +95870,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 64 - LSPA: 5 + LSPA: 4 LSPB: 8 - LVCA: 48 + LVCA: 64 LVCB: 32 - LVPA: 3 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94515,9 +95899,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94526,8 +95910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -94590,8 +95974,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94600,10 +95984,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -94612,10 +95996,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -94635,7 +96019,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -94644,7 +96028,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94655,21 +96039,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 64 LSPA: 8 - LSPB: 5 + LSPB: 8 LVCA: 32 - LVCB: 48 + LVCB: 32 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94684,9 +96068,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94694,12 +96078,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94758,8 +96144,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94768,11 +96154,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -94783,8 +96169,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94796,7 +96180,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94804,40 +96188,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94850,11 +96234,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94862,14 +96246,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94883,7 +96267,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -94928,8 +96312,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94938,21 +96322,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94973,7 +96357,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94986,43 +96370,43 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 32 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95030,14 +96414,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95096,29 +96480,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -95132,54 +96516,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95187,10 +96571,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95198,12 +96582,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95262,37 +96648,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95300,7 +96684,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95308,46 +96692,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95355,10 +96739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95366,14 +96750,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95432,31 +96816,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95468,7 +96852,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95476,57 +96860,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95534,14 +96918,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95600,35 +96984,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95636,7 +97020,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95644,7 +97028,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -95652,49 +97036,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95702,13 +97086,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -95768,31 +97152,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95804,7 +97188,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95812,7 +97196,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -95820,32 +97204,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95858,11 +97242,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95870,14 +97254,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95891,7 +97275,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95936,35 +97320,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95972,7 +97356,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95992,34 +97376,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96027,10 +97411,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96038,14 +97422,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96104,31 +97488,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96140,7 +97524,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96148,46 +97532,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 16 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96195,10 +97579,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96206,14 +97590,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96272,35 +97656,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96308,44 +97692,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96355,7 +97739,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96363,10 +97747,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96374,13 +97758,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -96395,7 +97777,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -96440,31 +97822,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96485,7 +97869,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96498,22 +97882,22 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96523,27 +97907,29 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -96592,6 +97978,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96608,15 +97995,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -96629,8 +98016,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -96644,7 +98031,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96652,36 +98039,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96698,26 +98085,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96760,6 +98149,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96776,8 +98166,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96786,21 +98176,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96812,44 +98202,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96866,25 +98256,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -96928,6 +98318,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96944,8 +98335,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96954,21 +98345,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96980,7 +98373,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96988,36 +98381,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -97034,23 +98427,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -97065,7 +98460,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -97094,6 +98489,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -97110,8 +98506,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97120,21 +98516,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -97148,48 +98544,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97202,11 +98598,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97216,14 +98612,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97283,8 +98677,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97293,21 +98687,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97319,48 +98715,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97373,11 +98769,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97387,14 +98783,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97454,8 +98848,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97464,21 +98858,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97490,7 +98886,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97498,40 +98894,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97544,11 +98940,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97558,12 +98954,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97623,8 +99019,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97633,23 +99029,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97661,48 +99057,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97715,11 +99111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97729,12 +99125,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97794,8 +99192,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97804,23 +99202,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97859,17 +99255,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -97888,9 +99284,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97900,8 +99296,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -97919,7 +99315,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -97965,8 +99361,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97975,11 +99371,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -97991,7 +99387,7 @@ WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98030,17 +99426,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -98059,9 +99455,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98071,8 +99467,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -98136,8 +99532,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98146,11 +99542,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98158,11 +99554,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98181,7 +99577,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -98200,22 +99596,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 - LVPA: 4 - LVPB: 3 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98229,10 +99625,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98242,12 +99638,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98307,8 +99705,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98317,11 +99715,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98329,11 +99727,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98371,22 +99767,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98400,10 +99796,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98415,12 +99811,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98480,8 +99876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98490,11 +99886,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98502,7 +99898,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -98542,22 +99938,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98571,10 +99967,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98588,8 +99984,8 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98649,8 +100045,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98659,11 +100055,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98694,41 +100090,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98742,9 +100138,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98755,12 +100151,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98820,8 +100218,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98830,23 +100228,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98866,36 +100262,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -98914,9 +100310,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98926,14 +100322,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98947,7 +100343,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98993,8 +100389,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99003,19 +100399,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -99029,7 +100425,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99037,56 +100433,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99097,14 +100493,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99164,31 +100560,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99200,64 +100596,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99268,12 +100664,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99333,33 +100731,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99371,65 +100767,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99439,14 +100835,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99506,31 +100900,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99542,65 +100938,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99610,14 +101006,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99631,7 +101025,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -99677,31 +101071,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99739,22 +101135,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -99768,10 +101164,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99783,12 +101179,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99848,15 +101244,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -99869,8 +101265,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -99891,7 +101287,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99910,22 +101306,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -99939,10 +101335,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99954,12 +101350,10 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100019,15 +101413,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -100040,14 +101434,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100055,48 +101451,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -100109,25 +101505,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -100171,7 +101567,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100188,15 +101583,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -100204,17 +101599,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100233,73 +101626,73 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100342,7 +101735,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100359,33 +101751,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100397,7 +101787,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100405,36 +101795,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -100444,7 +101834,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100452,27 +101842,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100515,7 +101903,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100532,35 +101919,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100568,61 +101955,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -100630,14 +102017,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -100684,7 +102071,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100701,37 +102087,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100759,44 +102143,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100805,13 +102189,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -100871,29 +102255,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -100914,7 +102298,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -100934,17 +102318,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -100963,9 +102347,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100973,14 +102357,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101039,8 +102421,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101049,11 +102431,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -101061,9 +102443,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101082,58 +102466,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101141,14 +102525,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101207,35 +102589,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101243,7 +102627,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101263,7 +102647,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -101278,19 +102662,19 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101310,13 +102694,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101375,31 +102759,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101419,57 +102803,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101477,14 +102861,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101543,35 +102927,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101579,40 +102963,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -101626,18 +103010,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101645,12 +103029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101709,15 +103095,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -101725,17 +103111,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101754,41 +103138,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101802,10 +103186,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101814,10 +103198,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -101877,15 +103263,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -101893,17 +103279,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101922,41 +103306,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101970,10 +103354,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101983,11 +103367,9 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -102047,15 +103429,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102063,15 +103445,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102083,7 +103467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102091,36 +103475,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102130,18 +103514,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102149,13 +103533,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -102215,35 +103599,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102258,31 +103642,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -102318,13 +103702,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102383,8 +103765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102399,15 +103781,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102419,7 +103803,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102439,24 +103823,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102466,7 +103850,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102474,9 +103858,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102485,14 +103869,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102551,14 +103935,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -102571,11 +103955,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102594,57 +103978,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102653,12 +104037,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102717,14 +104103,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -102733,17 +104119,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102755,14 +104139,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -102771,7 +104155,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102781,28 +104165,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102810,10 +104194,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102821,13 +104205,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -102887,31 +104269,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102923,14 +104307,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102949,14 +104333,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -102970,18 +104354,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102989,8 +104373,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -103053,15 +104439,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103074,12 +104460,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103091,7 +104475,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103099,46 +104483,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -103146,10 +104530,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103157,14 +104541,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103223,15 +104607,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103239,15 +104623,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103266,31 +104650,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -103327,12 +104711,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103391,8 +104773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103407,7 +104789,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -103416,6 +104798,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103427,48 +104811,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103481,7 +104865,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -103495,10 +104879,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103557,8 +104943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103573,7 +104959,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -103581,9 +104967,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103621,38 +105005,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103661,13 +105045,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -103727,14 +105111,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -103748,7 +105132,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -103763,7 +105147,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103790,27 +105174,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -103819,9 +105203,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103829,13 +105213,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -103895,15 +105279,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103916,10 +105300,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103938,41 +105322,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103986,10 +105370,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103997,12 +105381,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104061,8 +105447,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104071,23 +105457,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104099,7 +105483,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104115,7 +105499,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -104125,39 +105509,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104165,14 +105549,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104231,31 +105615,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104267,7 +105651,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104275,40 +105659,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104321,7 +105705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -104335,12 +105719,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104399,8 +105783,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104415,7 +105799,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -104423,7 +105807,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104435,7 +105819,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104451,7 +105835,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -104462,27 +105846,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -104491,9 +105875,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104501,13 +105885,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -104567,15 +105951,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -104588,10 +105972,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104603,54 +105987,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -104658,10 +106042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104669,14 +106053,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104735,31 +106117,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104787,49 +106171,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104837,14 +106221,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104903,28 +106287,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -104939,48 +106323,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104993,7 +106377,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -105007,12 +106391,10 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105071,8 +106453,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105087,7 +106469,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -105095,7 +106477,9 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105107,7 +106491,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105115,46 +106499,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105162,10 +106546,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105173,14 +106557,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105239,35 +106623,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105282,58 +106666,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 32 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105341,11 +106725,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -105405,33 +106791,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105443,7 +106827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105451,46 +106835,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105498,10 +106882,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105509,13 +106893,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -105575,31 +106959,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105611,64 +106995,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105677,11 +107061,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -105741,14 +107127,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -105761,13 +107147,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105779,7 +107163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105799,45 +107183,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105845,14 +107229,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105911,35 +107295,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105947,7 +107331,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105967,34 +107351,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -106002,10 +107386,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106013,13 +107397,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106079,31 +107463,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106122,7 +107506,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106141,22 +107525,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106170,9 +107554,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106181,13 +107565,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106247,8 +107629,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106257,10 +107639,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106269,9 +107651,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106283,14 +107667,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106309,22 +107693,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106337,10 +107721,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106349,14 +107733,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -106415,8 +107797,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106425,10 +107807,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106437,9 +107819,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106477,39 +107861,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106517,14 +107901,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -106583,28 +107967,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106619,7 +108003,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106627,46 +108011,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -106674,9 +108058,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106685,13 +108069,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106751,14 +108135,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -106767,15 +108151,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106787,14 +108171,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106813,14 +108197,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -106834,18 +108218,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106853,8 +108237,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106917,15 +108303,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -106938,12 +108324,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106955,54 +108339,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107010,10 +108394,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107021,12 +108405,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107085,33 +108471,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107123,7 +108507,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107149,22 +108533,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107177,11 +108561,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107192,11 +108576,11 @@ NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107255,15 +108639,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -107276,10 +108660,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107291,7 +108675,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107299,46 +108683,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107347,9 +108731,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107357,14 +108741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107423,15 +108807,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -107439,15 +108823,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107459,7 +108843,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107467,32 +108851,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -107513,11 +108897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107525,14 +108909,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107591,8 +108975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107601,13 +108985,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -107615,7 +108999,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107759,8 +109143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107781,13 +109165,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107815,7 +109199,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -107830,7 +109214,7 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 @@ -107842,14 +109226,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -107862,7 +109246,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -107927,28 +109311,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -107971,30 +109355,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -108030,13 +109414,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108095,8 +109479,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108111,11 +109495,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -108123,7 +109507,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108131,7 +109515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108151,34 +109535,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108186,10 +109570,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108197,13 +109581,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -108263,31 +109647,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -108319,28 +109703,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108354,10 +109738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108365,14 +109749,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108431,8 +109815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108441,25 +109825,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108467,7 +109851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108487,45 +109871,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108533,14 +109917,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108599,15 +109983,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -108619,11 +110003,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -108635,54 +110019,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108690,10 +110074,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108701,10 +110085,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -108767,15 +110149,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -108787,15 +110169,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108803,54 +110187,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108858,9 +110242,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -108869,14 +110253,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108935,35 +110317,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108971,7 +110355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108991,34 +110375,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109026,10 +110410,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109037,14 +110421,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109103,15 +110487,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -109123,11 +110507,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109139,7 +110523,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109165,39 +110549,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109205,14 +110589,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109271,35 +110655,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109307,54 +110691,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109362,10 +110746,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109373,12 +110757,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109437,33 +110823,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109475,54 +110859,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109543,9 +110927,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -109605,37 +110991,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109643,7 +111027,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109663,34 +111047,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109698,10 +111082,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109709,13 +111093,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -109775,31 +111159,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109818,41 +111202,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109866,10 +111250,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109877,14 +111261,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109943,8 +111325,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109953,25 +111335,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109979,7 +111363,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109987,46 +111371,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 16 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110034,10 +111418,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110045,14 +111429,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110111,31 +111495,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110147,7 +111531,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110156,56 +111540,56 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCA: 16 + LSCB: 64 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 6656 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110213,14 +111597,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110279,31 +111663,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110315,7 +111699,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110323,46 +111707,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110370,10 +111754,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110381,13 +111765,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -110447,31 +111831,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110490,9 +111874,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110503,28 +111887,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110538,10 +111922,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110549,8 +111933,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -110613,8 +111999,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110623,23 +112009,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110651,7 +112035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110667,7 +112051,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -110677,28 +112061,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110706,9 +112090,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -110717,14 +112101,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110783,14 +112167,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -110804,10 +112188,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110819,7 +112203,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110827,40 +112211,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 16 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110873,11 +112257,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110887,12 +112271,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110951,31 +112335,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110994,57 +112378,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111053,14 +112437,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111119,31 +112501,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111155,7 +112539,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111163,57 +112547,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111221,14 +112605,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111287,31 +112671,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111323,54 +112707,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -111379,9 +112763,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111389,14 +112773,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111455,15 +112837,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -111475,11 +112857,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111498,57 +112882,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111557,13 +112941,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -111623,14 +113005,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -111639,15 +113021,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111666,7 +113050,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -111727,6 +113111,8 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -111789,7 +113175,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 691 + SolutionIndex: 699 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -111814,8 +113200,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111853,38 +113237,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111893,13 +113277,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -111959,14 +113343,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -111980,7 +113364,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -112002,31 +113386,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -112063,10 +113447,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112125,8 +113511,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112141,7 +113527,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -112150,12 +113536,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112163,44 +113547,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -112210,18 +113594,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112229,12 +113613,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112293,33 +113679,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112331,7 +113715,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112339,46 +113723,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112387,9 +113771,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112397,14 +113781,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112463,15 +113847,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -112483,11 +113867,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112499,7 +113883,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112508,7 +113892,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112521,22 +113905,22 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -112546,7 +113930,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112554,10 +113938,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112568,11 +113952,11 @@ NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112631,31 +114015,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112667,7 +114051,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112675,36 +114059,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 8 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -112714,7 +114098,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112722,9 +114106,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112733,14 +114117,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112799,14 +114183,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -112815,19 +114199,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112855,7 +114239,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -112902,7 +114286,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -112967,8 +114351,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112987,9 +114371,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -113003,7 +114387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113029,14 +114413,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -113050,18 +114434,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113069,8 +114453,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -113135,15 +114519,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -113156,178 +114540,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 2 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 2 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113346,7 +114562,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113407,8 +114623,6 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -113471,7 +114685,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 701 + SolutionIndex: 708 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -113496,6 +114710,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113507,7 +114723,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113515,40 +114731,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113561,7 +114777,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -113574,13 +114790,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113639,8 +114855,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113655,15 +114871,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113682,20 +114898,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -113703,10 +114919,10 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -113742,13 +114958,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113807,8 +115021,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113827,11 +115041,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113843,7 +115059,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113869,22 +115085,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113897,11 +115113,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113914,7 +115130,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113973,15 +115189,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -113994,10 +115210,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -114011,7 +115227,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114037,28 +115253,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114066,9 +115282,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114077,14 +115293,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114143,14 +115359,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -114164,10 +115380,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114186,41 +115402,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114235,9 +115451,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114245,12 +115461,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114309,8 +115527,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114319,23 +115537,21 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114354,7 +115570,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -114373,39 +115589,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114415,10 +115631,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114477,20 +115695,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114498,12 +115716,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114523,40 +115739,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114570,9 +115786,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114581,14 +115797,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114647,8 +115863,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114657,19 +115873,19 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -114709,22 +115925,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114738,10 +115954,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114751,12 +115967,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114815,8 +116031,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114825,11 +116041,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -114837,7 +116053,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -114851,7 +116067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114859,46 +116075,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114907,23 +116123,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -114938,7 +116156,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -114967,12 +116185,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -114983,31 +116203,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115019,61 +116239,57 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115081,18 +116297,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -115105,7 +116323,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115135,12 +116353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -115151,31 +116371,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115187,7 +116407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115195,46 +116415,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -115243,23 +116463,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -115303,12 +116525,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -115319,31 +116543,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115444,7 +116668,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -115491,8 +116715,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115527,7 +116751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115562,9 +116786,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115577,7 +116801,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115593,12 +116817,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -115659,8 +116883,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115683,7 +116907,175 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115715,7 +117107,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -115764,7 +117156,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -115784,7 +117176,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115831,8 +117223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115851,7 +117243,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -115873,7 +117265,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -115887,7 +117279,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -115902,13 +117294,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115936,7 +117324,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -115955,8 +117343,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116003,8 +117391,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116023,7 +117411,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -116039,13 +117427,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -116059,7 +117447,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -116075,8 +117463,12 @@ LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116089,7 +117481,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116104,13 +117496,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116123,8 +117515,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116171,8 +117563,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116191,11 +117583,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116207,7 +117599,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116227,7 +117619,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -116242,9 +117634,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116257,7 +117649,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116272,13 +117664,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116339,8 +117731,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116359,11 +117751,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116464,7 +117856,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116511,8 +117903,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116533,7 +117925,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -116547,7 +117939,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116582,9 +117974,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116597,7 +117989,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116613,12 +118005,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116632,7 +118024,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116679,8 +118071,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116701,9 +118093,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116715,7 +118107,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116750,13 +118142,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116769,7 +118161,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116785,12 +118177,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116804,7 +118196,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116851,8 +118243,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116873,9 +118265,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116887,7 +118279,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116922,9 +118314,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116937,7 +118329,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116953,12 +118345,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117019,8 +118411,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117041,9 +118433,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117063,30 +118455,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -117124,13 +118516,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117191,8 +118583,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117207,13 +118599,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -117227,44 +118619,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117277,7 +118673,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117292,13 +118688,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117311,7 +118707,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117359,8 +118755,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117375,15 +118771,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117395,7 +118791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117403,40 +118799,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117449,7 +118845,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117464,13 +118860,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117531,8 +118927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117547,15 +118943,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117567,44 +118963,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117617,7 +119017,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117632,13 +119032,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117651,7 +119051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117699,8 +119099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117715,15 +119115,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117735,7 +119135,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117743,19 +119143,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -117763,20 +119163,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117789,7 +119189,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117804,7 +119204,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -117871,8 +119271,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117891,11 +119291,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117907,7 +119307,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117915,19 +119315,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -117935,20 +119335,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117961,7 +119361,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117976,7 +119376,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -118043,8 +119443,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118063,11 +119463,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118086,7 +119486,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -118095,7 +119495,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -118149,8 +119549,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -118215,8 +119613,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118237,9 +119635,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118251,16 +119651,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118271,7 +119671,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -118279,20 +119679,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118305,7 +119705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -118320,9 +119720,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -118387,8 +119785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118407,11 +119805,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118423,7 +119823,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118431,40 +119831,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118477,10 +119877,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118491,14 +119891,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -118559,8 +119959,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118569,21 +119969,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118595,7 +119995,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118603,40 +120003,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118649,10 +120049,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118663,14 +120063,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -118731,8 +120131,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118741,21 +120141,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118793,22 +120193,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 8 + LSPA: 5 LSPB: 8 - LVCA: 32 + LVCA: 48 LVCB: 32 - LVPA: 4 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118822,9 +120222,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118835,11 +120235,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -118901,8 +120301,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118911,10 +120311,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -118923,10 +120323,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -118939,7 +120339,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118947,40 +120347,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118993,10 +120393,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119007,11 +120407,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -119073,8 +120473,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119083,22 +120483,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -119137,22 +120537,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119166,10 +120566,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119181,12 +120581,12 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119247,8 +120647,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119257,11 +120657,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -119269,7 +120669,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -119290,41 +120690,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 4 + LSPB: 5 LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119338,10 +120738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119353,9 +120753,7 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -119419,8 +120817,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119429,21 +120827,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119481,22 +120881,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119510,10 +120910,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119527,8 +120927,8 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119589,8 +120989,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119599,11 +120999,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -119611,11 +121011,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119634,41 +121034,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119682,10 +121082,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119695,12 +121095,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119714,7 +121116,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -119761,8 +121163,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119771,23 +121173,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119819,24 +121219,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 4 - LSPB: 8 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -119855,9 +121255,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119867,14 +121267,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119888,7 +121288,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -119935,8 +121335,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119945,17 +121345,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -119978,7 +121378,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119998,17 +121398,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -120027,9 +121427,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120039,8 +121439,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120058,7 +121460,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120105,8 +121507,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120115,11 +121517,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -120130,8 +121532,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120150,7 +121550,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -120170,17 +121570,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -120199,9 +121599,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120211,8 +121611,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120230,7 +121632,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120277,8 +121679,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120287,11 +121689,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -120302,8 +121704,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120323,30 +121723,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -120384,13 +121784,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -120404,7 +121804,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -120451,8 +121851,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120467,187 +121867,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120748,7 +121976,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -120795,8 +122023,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120817,7 +122045,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -120831,27 +122059,27 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -120859,20 +122087,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120885,7 +122109,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -120900,7 +122124,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -120919,7 +122143,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -120967,8 +122191,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120987,11 +122211,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121010,7 +122234,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -121073,8 +122297,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121139,8 +122361,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121164,6 +122386,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121182,7 +122406,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -121245,8 +122469,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121311,8 +122533,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121336,6 +122558,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121347,44 +122571,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121397,11 +122625,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121413,11 +122641,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121431,7 +122659,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -121479,8 +122707,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121489,21 +122717,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121522,41 +122750,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 32 + LVCA: 128 LVCB: 64 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121570,10 +122798,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121584,10 +122812,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121602,7 +122832,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -121649,8 +122879,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121659,23 +122889,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121694,41 +122922,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 32 + LVCA: 128 LVCB: 64 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121742,10 +122970,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121756,10 +122984,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121821,8 +123051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121831,23 +123061,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121867,30 +123095,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false @@ -121928,13 +123156,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -121995,8 +123223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122011,11 +123239,11 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -122038,31 +123266,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false @@ -122100,13 +123328,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122120,7 +123346,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122167,8 +123393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122183,15 +123409,17 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122203,13 +123431,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122223,28 +123451,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 2 + LSPA: 8 LSPB: 4 - LVCA: 128 + LVCA: 32 LVCB: 64 - LVPA: 2 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122257,10 +123481,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122271,14 +123495,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122291,7 +123515,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122339,8 +123563,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122349,21 +123573,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122375,48 +123599,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122429,10 +123649,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122443,14 +123663,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122463,8 +123683,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122511,8 +123731,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122521,21 +123741,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122554,8 +123774,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -122563,32 +123783,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122602,9 +123822,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122615,11 +123835,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -122681,8 +123903,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122691,10 +123913,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -122706,8 +123928,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122725,7 +123945,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122754,22 +123974,26 @@ LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -122803,7 +124027,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122851,20 +124075,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -122872,7 +124096,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -122893,7 +124117,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122907,7 +124131,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -122922,22 +124146,26 @@ LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -122952,7 +124180,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -122971,7 +124199,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -123019,29 +124247,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -123064,7 +124292,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123077,43 +124305,43 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 512 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123123,8 +124351,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123191,15 +124419,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -123207,13 +124435,13 @@ ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -123234,41 +124462,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -123283,9 +124511,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123295,14 +124523,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123363,31 +124589,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123425,22 +124653,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -123454,10 +124682,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123469,12 +124697,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123535,15 +124763,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -123556,7 +124784,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123571,7 +124799,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123587,48 +124815,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123639,14 +124867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123707,31 +124935,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123743,7 +124971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123759,7 +124987,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -123769,28 +124997,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -123798,9 +125026,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123811,12 +125039,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123877,31 +125105,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -123924,56 +125152,56 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123983,14 +125211,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124051,15 +125279,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124067,13 +125295,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124107,28 +125335,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124142,10 +125370,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124155,14 +125383,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124223,28 +125451,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -124259,16 +125487,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124281,26 +125509,26 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124313,11 +125541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124329,10 +125557,12 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124393,33 +125623,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124439,8 +125667,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124452,27 +125680,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 64 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124486,10 +125714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124567,15 +125795,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124588,8 +125816,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124603,7 +125831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124611,7 +125839,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -124619,28 +125847,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -124657,10 +125885,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124671,14 +125899,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124739,8 +125967,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124749,21 +125977,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124775,16 +126003,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124797,26 +126025,26 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124829,10 +126057,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124843,13 +126071,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -124911,8 +126137,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124921,10 +126147,10 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124935,7 +126161,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124947,15 +126175,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -124963,49 +126191,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125015,15 +126239,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125035,7 +126259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125083,15 +126307,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125099,15 +126323,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125119,15 +126343,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -125135,49 +126359,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125187,15 +126407,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125207,7 +126427,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125255,15 +126475,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125271,15 +126491,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125291,14 +126511,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -125318,38 +126538,34 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125359,13 +126575,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125377,7 +126595,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125425,15 +126643,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125446,12 +126664,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125469,7 +126685,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -125483,24 +126699,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 2 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125515,9 +126735,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125527,14 +126747,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125547,8 +126767,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125595,8 +126815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125605,17 +126825,17 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -125637,7 +126857,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -125658,17 +126878,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 2 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125683,9 +126907,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125695,14 +126919,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125715,8 +126939,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125763,8 +126987,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125773,11 +126997,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -125785,7 +127009,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -125799,44 +127023,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125849,11 +127077,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125863,14 +127091,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125883,8 +127111,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125931,8 +127159,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125941,21 +127169,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125975,30 +127203,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126036,13 +127264,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126103,8 +127331,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126119,11 +127347,11 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -126147,30 +127375,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126209,12 +127437,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126275,8 +127503,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126291,13 +127519,13 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -126318,31 +127546,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126381,12 +127609,10 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126447,8 +127673,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126463,7 +127689,7 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -126472,6 +127698,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126490,7 +127718,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126553,8 +127781,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -126572,7 +127798,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126619,8 +127845,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126641,9 +127867,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126656,47 +127884,211 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -126709,10 +128101,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126723,14 +128115,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126743,8 +128135,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126791,31 +128183,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126835,40 +128227,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -126882,9 +128274,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126895,12 +128287,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126914,7 +128306,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126961,29 +128353,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -126999,48 +128391,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127053,10 +128441,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127067,12 +128455,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127085,7 +128473,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -127133,31 +128521,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127173,8 +128561,8 @@ CheckTensorDimAsserts: false DepthU: 8 DirectToLds: true - DirectToLdsA: true - DirectToLdsB: false + DirectToLdsA: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -127197,18 +128585,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 - LVCA: 128 - LVCB: 32 - LVPA: 1 - LVPB: 4 + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 LdcEqualsLdd: false LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127217,15 +128605,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true - LocalWriteUseSgprB: false + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127237,12 +128625,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127303,29 +128691,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -127339,44 +128727,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127389,10 +128781,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127403,14 +128795,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127423,8 +128815,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127471,8 +128863,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127481,21 +128873,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127514,41 +128906,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127562,9 +128954,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127575,12 +128967,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127594,7 +128988,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127641,8 +129035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127651,23 +129045,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127679,44 +129071,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127729,10 +129125,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127743,12 +129139,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127761,8 +129157,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127809,8 +129205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127819,21 +129215,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127848,43 +129244,47 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 128 LSPA: 4 - LSPB: 1 + LSPB: 2 LVCA: 32 - LVCB: 128 - LVPA: 4 + LVCB: 64 + LVPA: 2 LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127894,13 +129294,13 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127911,14 +129311,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127931,7 +129329,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -127979,8 +129377,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127989,21 +129387,23 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128022,7 +129422,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128042,21 +129442,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128071,9 +129471,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128083,14 +129483,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128151,8 +129549,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128161,11 +129559,11 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -128173,9 +129571,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128195,32 +129595,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -128255,15 +129655,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128319,33 +129719,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -128366,33 +129768,33 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -128427,13 +129829,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128489,37 +129893,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128531,48 +129935,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 - LVCA: 32 + LSPB: 4 + LVCA: 64 LVCB: 64 - LVPA: 2 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128585,11 +129985,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128599,13 +129999,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128617,7 +130019,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -128661,37 +130063,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128703,48 +130105,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 2 - LVPB: 1 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128757,11 +130159,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128771,13 +130173,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128790,7 +130194,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128833,37 +130237,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128895,28 +130299,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128931,9 +130335,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128943,14 +130347,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128964,7 +130368,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129013,8 +130417,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129023,19 +130427,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -129076,21 +130480,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129105,9 +130509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129117,14 +130521,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129187,8 +130591,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129197,11 +130601,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -129209,7 +130613,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -129223,13 +130627,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -129250,17 +130654,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129273,11 +130681,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129287,13 +130695,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -129307,8 +130715,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -129357,8 +130765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129367,11 +130775,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -129379,9 +130787,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129393,48 +130801,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129447,11 +130855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129461,14 +130869,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129482,7 +130888,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -129531,8 +130937,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129541,21 +130947,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129574,31 +130982,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -129637,12 +131045,10 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129705,8 +131111,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129721,15 +131127,17 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129748,31 +131156,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -129810,13 +131218,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129830,7 +131236,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129879,8 +131285,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129895,15 +131301,17 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129915,13 +131323,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -129941,22 +131349,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129969,11 +131373,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129985,11 +131389,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -130003,8 +131407,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130053,8 +131457,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130063,11 +131467,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130075,9 +131479,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130096,41 +131500,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130145,9 +131549,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130158,12 +131562,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130225,15 +131631,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -130245,13 +131651,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130270,41 +131674,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130319,9 +131723,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130332,12 +131736,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130350,7 +131756,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -130399,15 +131805,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -130419,13 +131825,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130438,47 +131842,43 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 32 + LSPA: 1 LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130487,15 +131887,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130506,12 +131906,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130523,7 +131925,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -130573,33 +131975,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130611,44 +132011,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130661,10 +132065,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130677,13 +132081,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130695,8 +132099,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -130745,14 +132149,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -130761,15 +132165,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130788,7 +132192,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130851,8 +132255,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -130919,31 +132321,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130963,21 +132367,22 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly @@ -130985,10 +132390,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -131023,7 +132428,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -131031,20 +132436,22 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -131093,28 +132500,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -131130,43 +132537,48 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 - LVCA: 128 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 LVCB: 32 - LVPA: 1 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131175,15 +132587,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131193,27 +132605,27 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -131263,31 +132675,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT6_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131307,40 +132721,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 8 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131354,9 +132769,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131368,27 +132783,29 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -131437,14 +132854,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -131457,9 +132874,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -131481,40 +132898,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 8 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131528,9 +132946,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131542,18 +132960,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -131609,14 +133029,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -131629,15 +133049,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -131647,28 +133067,28 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 @@ -131676,20 +133096,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131702,7 +133122,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -131711,15 +133131,12 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -131788,8 +133205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131808,13 +133225,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -131833,40 +133252,40 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 + LSCA: 64 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 48 + LVCA: 32 LVCB: 32 - LVPA: 3 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131880,24 +133299,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -131963,8 +133381,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT6_4_USFGRO0_VW2_WGM8 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131973,25 +133391,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132008,42 +133426,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCA: 64 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132057,26 +133475,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -132142,8 +133557,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132152,23 +133567,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132186,41 +133603,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 32 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132234,26 +133651,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -132268,7 +133684,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -132317,14 +133733,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -132337,15 +133753,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132355,49 +133771,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132410,7 +133826,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -132425,10 +133841,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132444,7 +133862,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -132493,8 +133911,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132509,19 +133927,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132538,42 +133954,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132588,9 +134004,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132599,12 +134015,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132620,7 +134038,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -132669,8 +134087,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132679,25 +134097,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132707,49 +134123,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132762,11 +134174,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132775,12 +134187,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132795,7 +134209,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -132845,8 +134259,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132855,25 +134269,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132884,48 +134296,44 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132934,15 +134342,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132953,10 +134361,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132971,7 +134381,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133021,15 +134431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_AMAS3_DTL1_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -133037,17 +134447,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133059,49 +134467,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133114,7 +134523,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -133128,9 +134537,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -133150,13 +134557,14 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133180,6 +134588,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133199,8 +134608,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133215,15 +134624,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133239,45 +134650,46 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133292,9 +134704,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133303,13 +134715,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -133326,13 +134736,14 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133356,6 +134767,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133375,8 +134787,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133385,21 +134797,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133415,41 +134829,42 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133463,9 +134878,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133475,14 +134890,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -133505,6 +134920,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133528,6 +134944,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133547,8 +134964,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_NLCA1_NLCB1_PGR0_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133557,19 +134974,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -133584,12 +135001,13 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -133603,7 +135021,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -133611,17 +135029,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 + LSCB: 128 + LSPA: 2 + LSPB: 2 LVCA: 128 - LVCB: 32 - LVPA: 1 - LVPB: 4 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133630,15 +135052,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133647,15 +135069,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -133669,7 +135091,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133677,6 +135099,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133700,6 +135123,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133719,29 +135143,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_AMAS3_DTL1_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - [2, 3, 0, 1] @@ -138979,5220 +140403,5236 @@ - [400, 8824.43] - - [784, 128, 64, 512] - [402, 9393.09] + - - [65, 1024, 1, 6400] + - [413, 3556.98] + - - [256, 4096, 1, 6400] + - [414, 10132.4] + - - [1024, 4096, 1, 64] + - [415, 6918.44] + - - [1024, 4096, 1, 6336] + - [416, 10393.9] - - [1024, 128, 1, 128] - - [417, 1028.02] + - [421, 1028.02] - - [4, 704, 1, 1280] - - [456, 363.355] + - [460, 363.355] - - [4, 1856, 1, 3328] - - [456, 579.434] + - [460, 579.434] - - [1856, 448, 1, 3328] - - [493, 6966.73] + - [497, 6966.73] - - [2944, 4288, 1, 1280] - - [488, 9057.88] + - [492, 9057.88] - - [2368, 64, 1, 3328] - - [449, 5837.56] + - [453, 5837.56] - - [2368, 5888, 1, 256] - - [493, 9111.06] + - [497, 9111.06] - - [128, 64, 1, 256] - - [455, 374.491] + - [459, 374.491] - - [5888, 1024, 1, 1280] - - [498, 8570.44] + - [502, 8570.44] - - [128, 6784, 1, 3328] - - [461, 7703.86] + - [465, 7703.86] - - [64, 4, 1, 256] - - [507, 11.2219] + - [511, 11.2219] - - [5888, 1856, 1, 3328] - - [493, 9394.3] + - [497, 9394.3] - - [5056, 704, 1, 256] - - [496, 8026.89] + - [500, 8026.89] - - [5888, 2944, 1, 3328] - - [486, 7608.11] + - [490, 7608.11] - - [1856, 4288, 1, 256] - - [487, 8986.32] + - [491, 8986.32] - - [1024, 5056, 1, 128] - - [479, 3898.24] + - [483, 3898.24] - - [5056, 5056, 1, 3328] - - [487, 9536.75] + - [491, 9536.75] - - [1408, 5888, 1, 1280] - - [488, 9279.09] + - [492, 9279.09] - - [2368, 448, 1, 128] - - [480, 2474.32] + - [484, 2474.32] - - [1024, 3584, 1, 3328] - - [490, 9258.48] + - [494, 9258.48] - - [4, 2944, 1, 1280] - - [442, 611.74] + - [446, 611.74] - - [1408, 64, 1, 128] - - [413, 858.21] + - [417, 858.21] - - [256, 4288, 1, 3328] - - [493, 7615.98] + - [497, 7615.98] - - [5888, 1408, 1, 1280] - - [486, 9620.29] + - [490, 9620.29] - - [704, 1856, 1, 3328] - - [487, 9033.65] + - [491, 9033.65] - - [4, 1408, 1, 128] - - [500, 24.355] + - [504, 24.355] - - [1024, 2368, 1, 256] - - [487, 7526.15] + - [491, 7526.15] - - [1408, 1856, 1, 1280] - - [490, 8324.09] + - [494, 8324.09] - - [1408, 64, 1, 1280] - - [461, 4681.14] + - [465, 4681.14] - - [448, 1024, 1, 1280] - - [487, 7112.43] + - [491, 7112.43] - - [256, 1408, 1, 3328] - - [493, 5825.41] + - [497, 5825.41] - - [5056, 5056, 1, 1280] - - [496, 9233.55] + - [500, 9233.55] - - [448, 5056, 1, 256] - - [488, 7003.17] + - [492, 7003.17] - - [704, 1856, 1, 1280] - - [487, 8877.28] + - [491, 8877.28] - - [128, 5056, 1, 128] - - [479, 2301.04] + - [483, 2301.04] - - [2368, 128, 1, 256] - - [487, 3848.94] + - [491, 3848.94] - - [1856, 1408, 1, 128] - - [482, 4202.21] + - [486, 4202.21] - - [64, 5056, 1, 256] - - [488, 3109.52] + - [492, 3109.52] - - [6784, 256, 1, 3328] - - [487, 6388.43] + - [491, 6388.43] - - [6784, 4288, 1, 3328] - - [498, 9114.57] + - [502, 9114.57] - - [4288, 448, 1, 256] - - [491, 5782.95] + - [495, 5782.95] - - [64, 704, 1, 128] - - [424, 379.419] + - [428, 379.419] - - [1856, 2368, 1, 3328] - - [487, 9128.36] + - [491, 9128.36] - - [4288, 2944, 1, 1280] - - [493, 9182.23] + - [497, 9182.23] - - [704, 5056, 1, 1280] - - [487, 9071.47] + - [491, 9071.47] - - [2368, 704, 1, 3328] - - [493, 7731.33] + - [497, 7731.33] - - [256, 5888, 1, 256] - - [487, 7920.28] + - [491, 7920.28] - - [1856, 4288, 1, 3328] - - [493, 9329.97] + - [497, 9329.97] - - [256, 2944, 1, 256] - - [494, 5312.17] + - [498, 5312.17] - - [5888, 1024, 1, 256] - - [485, 6710.87] + - [489, 6710.87] - - [448, 64, 1, 1280] - - [460, 2814.43] + - [464, 2814.43] - - [448, 5056, 1, 3328] - - [487, 8255.43] + - [491, 8255.43] - - [3584, 4, 1, 1280] - - [436, 640.715] + - [440, 640.715] - - [2944, 64, 1, 256] - - [435, 2621.44] + - [439, 2621.44] - - [128, 4, 1, 1280] - - [507, 86.2316] + - [511, 86.2316] - - [1408, 2944, 1, 256] - - [487, 8848.89] + - [491, 8848.89] - - [256, 1856, 1, 1280] - - [487, 7366.45] + - [491, 7366.45] - - [6784, 5056, 1, 3328] - - [498, 8332.06] + - [502, 8332.06] - - [5056, 5056, 1, 256] - - [493, 9171.64] + - [497, 9171.64] - - [1408, 6784, 1, 128] - - [479, 5079.09] + - [483, 5079.09] - - [64, 1024, 1, 1280] - - [451, 3679.21] + - [455, 3679.21] - - [2944, 4, 1, 256] - - [442, 369.443] + - [446, 369.443] - - [704, 5056, 1, 128] - - [479, 4509.17] + - [483, 4509.17] - - [4, 2368, 1, 1280] - - [436, 569.744] + - [440, 569.744] - - [2368, 2944, 1, 1280] - - [498, 7451.04] + - [502, 7451.04] - - [128, 3584, 1, 1280] - - [496, 6071.16] + - [500, 6071.16] - - [6784, 6784, 1, 1280] - - [493, 9535.64] + - [497, 9535.64] - - [1408, 4288, 1, 1280] - - [496, 8254.99] + - [500, 8254.99] - - [3584, 4288, 1, 1280] - - [498, 9651.09] + - [502, 9651.09] - - [2368, 704, 1, 1280] - - [493, 8291.3] + - [497, 8291.3] - - [5056, 4288, 1, 3328] - - [485, 9406.26] + - [489, 9406.26] - - [3584, 2368, 1, 3328] - - [493, 9350.22] + - [497, 9350.22] - - [64, 704, 1, 1280] - - [460, 3384.49] + - [464, 3384.49] - - [4288, 256, 1, 256] - - [493, 5593.52] + - [497, 5593.52] - - [2944, 128, 1, 128] - - [415, 2130.5] + - [419, 2130.5] - - [6784, 448, 1, 1280] - - [496, 8815.75] + - [500, 8815.75] - - [1408, 2944, 1, 128] - - [479, 4558.24] + - [483, 4558.24] - - [4288, 2944, 1, 256] - - [498, 7865.33] + - [502, 7865.33] - - [5888, 704, 1, 1280] - - [487, 9262.89] + - [491, 9262.89] - - [1856, 64, 1, 1280] - - [461, 4359.05] + - [465, 4359.05] - - [448, 5888, 1, 128] - - [482, 4000.49] + - [486, 4000.49] - - [5888, 64, 1, 3328] - - [462, 6603.29] + - [466, 6603.29] - - [2944, 256, 1, 3328] - - [487, 8423.53] + - [491, 8423.53] - - [1024, 64, 1, 128] - - [432, 582.542] + - [436, 582.542] - - [5056, 2368, 1, 1280] - - [487, 9419.81] + - [491, 9419.81] - - [448, 3584, 1, 1280] - - [487, 7985.72] + - [491, 7985.72] - - [6784, 5888, 1, 256] - - [485, 9494.26] + - [489, 9494.26] - - [704, 1024, 1, 128] - - [479, 2813.25] + - [483, 2813.25] - - [704, 128, 1, 1280] - - [461, 4477.61] + - [465, 4477.61] - - [5888, 2944, 1, 128] - - [482, 4745.86] + - [486, 4745.86] - - [4, 3584, 1, 128] - - [499, 96.379] + - [503, 96.379] - - [1408, 448, 1, 1280] - - [487, 6912.7] + - [491, 6912.7] - - [1024, 1408, 1, 256] - - [495, 5810.75] + - [499, 5810.75] - - [2368, 2368, 1, 3328] - - [496, 9088.61] + - [500, 9088.61] - - [1856, 6784, 1, 128] - - [482, 5168.22] + - [486, 5168.22] - - [5056, 704, 1, 3328] - - [488, 7464.8] + - [492, 7464.8] - - [1408, 1856, 1, 256] - - [493, 6727.59] + - [497, 6727.59] - - [1408, 704, 1, 3328] - - [493, 8379.43] + - [497, 8379.43] - - [2368, 5056, 1, 256] - - [493, 8664.01] + - [497, 8664.01] - - [5888, 1856, 1, 256] - - [498, 5809.92] + - [502, 5809.92] - - [4288, 64, 1, 3328] - - [475, 6583.84] + - [479, 6583.84] - - [2368, 4, 1, 1280] - - [508, 545.151] + - [512, 545.151] - - [704, 5888, 1, 256] - - [493, 8813.61] + - [497, 8813.61] - - [4288, 64, 1, 256] - - [451, 3059.87] + - [455, 3059.87] - - [6784, 64, 1, 256] - - [493, 3490.86] + - [497, 3490.86] - - [2944, 256, 1, 256] - - [487, 6970.3] + - [491, 6970.3] - - [2944, 6784, 1, 3328] - - [487, 9475.69] + - [491, 9475.69] - - [704, 1408, 1, 3328] - - [487, 8154.08] + - [491, 8154.08] - - [3584, 704, 1, 3328] - - [487, 8994.97] + - [491, 8994.97] - - [2944, 256, 1, 128] - - [479, 2824.03] + - [483, 2824.03] - - [6784, 4, 1, 1280] - - [436, 625.614] + - [440, 625.614] - - [1024, 64, 1, 1280] - - [448, 3307.81] + - [452, 3307.81] - - [448, 4288, 1, 256] - - [493, 6074.38] + - [497, 6074.38] - - [64, 3584, 1, 3328] - - [441, 6200.16] + - [445, 6200.16] - - [704, 2368, 1, 1280] - - [487, 8291.3] + - [491, 8291.3] - - [448, 2944, 1, 128] - - [479, 3221.77] + - [483, 3221.77] - - [1856, 2368, 1, 1280] - - [498, 6855.14] + - [502, 6855.14] - - [2368, 128, 1, 3328] - - [449, 6479.51] + - [453, 6479.51] - - [2944, 128, 1, 256] - - [487, 3828.13] + - [491, 3828.13] - - [448, 1408, 1, 256] - - [488, 4525.8] + - [492, 4525.8] - - [1856, 4288, 1, 1280] - - [486, 9160.22] + - [490, 9160.22] - - [64, 5056, 1, 3328] - - [469, 6819.2] + - [473, 6819.2] - - [4, 704, 1, 256] - - [453, 123.441] + - [457, 123.441] - - [1024, 448, 1, 128] - - [482, 1989.17] + - [486, 1989.17] - - [704, 4, 1, 1280] - - [456, 381.831] + - [460, 381.831] - - [704, 256, 1, 128] - - [479, 1109.07] + - [483, 1109.07] - - [704, 2944, 1, 128] - - [479, 4088.93] + - [483, 4088.93] - - [1408, 1024, 1, 1280] - - [493, 8191.98] + - [497, 8191.98] - - [704, 6784, 1, 256] - - [487, 6717.8] + - [491, 6717.8] - - [6784, 704, 1, 256] - - [493, 5429.12] + - [497, 5429.12] - - [5056, 1408, 1, 128] - - [479, 4954.4] + - [483, 4954.4] - - [256, 3584, 1, 3328] - - [487, 7890.86] + - [491, 7890.86] - - [4, 5888, 1, 3328] - - [504, 690.947] + - [508, 690.947] - - [128, 1408, 1, 128] - - [426, 1393.04] + - [430, 1393.04] - - [3584, 4288, 1, 3328] - - [489, 8900.77] + - [493, 8900.77] - - [5888, 1856, 1, 1280] - - [490, 9345.75] + - [494, 9345.75] - - [5056, 1024, 1, 3328] - - [491, 7834.74] + - [495, 7834.74] - - [5056, 64, 1, 1280] - - [469, 5890.04] + - [473, 5890.04] - - [1024, 704, 1, 256] - - [487, 6007.47] + - [491, 6007.47] - - [1024, 4288, 1, 128] - - [481, 3496.99] + - [485, 3496.99] - - [4288, 64, 1, 1280] - - [466, 4726.49] + - [470, 4726.49] - - [2368, 3584, 1, 1280] - - [485, 8128.72] + - [489, 8128.72] - - [2368, 6784, 1, 1280] - - [485, 9478.62] + - [489, 9478.62] - - [1024, 256, 1, 256] - - [493, 4092.0] + - [497, 4092.0] - - [1856, 4, 1, 1280] - - [508, 509.803] + - [512, 509.803] - - [448, 448, 1, 256] - - [493, 3001.18] + - [497, 3001.18] - - [2944, 3584, 1, 3328] - - [494, 9081.81] + - [498, 9081.81] - - [128, 4288, 1, 128] - - [414, 2323.23] + - [418, 2323.23] - - [64, 448, 1, 256] - - [457, 1066.87] + - [461, 1066.87] - - [128, 1024, 1, 3328] - - [470, 6392.26] + - [474, 6392.26] - - [4, 1408, 1, 3328] - - [453, 616.556] + - [457, 616.556] - - [6784, 2944, 1, 256] - - [496, 8547.63] + - [500, 8547.63] - - [64, 1856, 1, 1280] - - [469, 4409.61] + - [473, 4409.61] - - [64, 1024, 1, 128] - - [413, 554.802] + - [417, 554.802] - - [4288, 2368, 1, 3328] - - [489, 8779.98] + - [493, 8779.98] - - [1856, 2368, 1, 256] - - [496, 4976.64] + - [500, 4976.64] - - [3584, 256, 1, 128] - - [481, 2812.27] + - [485, 2812.27] - - [3584, 6784, 1, 3328] - - [491, 9278.12] + - [495, 9278.12] - - [256, 1024, 1, 256] - - [487, 4346.43] + - [491, 4346.43] - - [4, 6784, 1, 3328] - - [506, 681.266] + - [510, 681.266] - - [1024, 5888, 1, 3328] - - [487, 9187.51] + - [491, 9187.51] - - [1024, 128, 1, 1280] - - [439, 3659.95] + - [443, 3659.95] - - [4288, 128, 1, 1280] - - [493, 6019.07] + - [497, 6019.07] - - [5056, 4288, 1, 1280] - - [485, 9343.86] + - [489, 9343.86] - - [5888, 64, 1, 256] - - [487, 4692.07] + - [491, 4692.07] - - [1856, 256, 1, 1280] - - [493, 4790.28] + - [497, 4790.28] - - [64, 5888, 1, 3328] - - [461, 6702.1] + - [465, 6702.1] - - [2944, 5888, 1, 128] - - [482, 5202.55] + - [486, 5202.55] - - [704, 5888, 1, 1280] - - [487, 9264.19] + - [491, 9264.19] - - [2368, 3584, 1, 128] - - [479, 5053.61] + - [483, 5053.61] - - [6784, 5888, 1, 3328] - - [485, 7926.7] + - [489, 7926.7] - - [704, 1024, 1, 1280] - - [486, 5402.5] + - [490, 5402.5] - - [448, 256, 1, 3328] - - [469, 6124.55] + - [473, 6124.55] - - [448, 1856, 1, 128] - - [480, 2885.86] + - [484, 2885.86] - - [128, 1024, 1, 128] - - [414, 1013.12] + - [418, 1013.12] - - [2944, 4, 1, 128] - - [499, 77.5374] + - [503, 77.5374] - - [1024, 704, 1, 1280] - - [487, 7365.48] + - [491, 7365.48] - - [128, 5888, 1, 256] - - [487, 6990.51] + - [491, 6990.51] - - [1024, 5056, 1, 1280] - - [492, 9421.9] + - [496, 9421.9] - - [4288, 1024, 1, 256] - - [494, 6269.93] + - [498, 6269.93] - - [2944, 2368, 1, 128] - - [479, 4918.08] + - [483, 4918.08] - - [704, 704, 1, 3328] - - [487, 7963.55] + - [491, 7963.55] - - [704, 1408, 1, 1280] - - [487, 8347.22] + - [491, 8347.22] - - [5888, 448, 1, 1280] - - [493, 5216.95] + - [497, 5216.95] - - [3584, 256, 1, 3328] - - [487, 7802.15] + - [491, 7802.15] - - [704, 5888, 1, 3328] - - [493, 8381.36] + - [497, 8381.36] - - [704, 1856, 1, 128] - - [479, 3598.28] + - [483, 3598.28] - - [128, 3584, 1, 3328] - - [449, 7161.01] + - [453, 7161.01] - - [6784, 2368, 1, 1280] - - [498, 9464.31] + - [502, 9464.31] - - [4, 4288, 1, 128] - - [499, 132.58] + - [503, 132.58] - - [128, 704, 1, 1280] - - [461, 4463.75] + - [465, 4463.75] - - [3584, 2944, 1, 256] - - [498, 8201.14] + - [502, 8201.14] - - [1856, 128, 1, 3328] - - [440, 6575.4] + - [444, 6575.4] - - [4, 64, 1, 1280] - - [456, 43.5745] + - [460, 43.5745] - - [4, 5056, 1, 3328] - - [436, 675.215] + - [440, 675.215] - - [128, 2944, 1, 1280] - - [440, 5916.89] + - [444, 5916.89] - - [2368, 1024, 1, 3328] - - [493, 8646.74] + - [497, 8646.74] - - [128, 256, 1, 3328] - - [474, 4130.75] + - [478, 4130.75] - - [1408, 5056, 1, 3328] - - [492, 9529.65] + - [496, 9529.65] - - [1856, 1856, 1, 3328] - - [491, 8114.89] + - [495, 8114.89] - - [3584, 128, 1, 256] - - [487, 5603.08] + - [491, 5603.08] - - [448, 1408, 1, 3328] - - [487, 7072.93] + - [491, 7072.93] - - [2368, 2368, 1, 256] - - [494, 7648.66] + - [498, 7648.66] - - [4288, 4288, 1, 1280] - - [489, 9244.01] + - [493, 9244.01] - - [64, 448, 1, 1280] - - [460, 2885.23] + - [464, 2885.23] - - [1408, 4288, 1, 256] - - [487, 8080.31] + - [491, 8080.31] - - [448, 4, 1, 256] - - [505, 84.3294] + - [509, 84.3294] - - [5888, 448, 1, 128] - - [482, 3540.7] + - [486, 3540.7] - - [448, 4, 1, 1280] - - [456, 322.157] + - [460, 322.157] - - [704, 6784, 1, 3328] - - [486, 8613.48] + - [490, 8613.48] - - [5888, 5888, 1, 1280] - - [493, 9501.95] + - [497, 9501.95] - - [5056, 1024, 1, 1280] - - [496, 9110.01] + - [500, 9110.01] - - [448, 5888, 1, 3328] - - [487, 8586.33] + - [491, 8586.33] - - [128, 4, 1, 128] - - [499, 4.17959] + - [503, 4.17959] - - [1024, 2944, 1, 1280] - - [495, 7096.43] + - [499, 7096.43] - - [5056, 5888, 1, 1280] - - [486, 9693.41] + - [490, 9693.41] - - [4288, 5888, 1, 128] - - [479, 5406.36] + - [483, 5406.36] - - [256, 3584, 1, 256] - - [487, 6908.27] + - [491, 6908.27] - - [1408, 3584, 1, 128] - - [479, 4645.59] + - [483, 4645.59] - - [256, 2944, 1, 3328] - - [490, 6284.3] + - [494, 6284.3] - - [448, 3584, 1, 128] - - [482, 3675.27] + - [486, 3675.27] - - [5888, 2944, 1, 1280] - - [492, 9628.8] + - [496, 9628.8] - - [4, 6784, 1, 1280] - - [436, 688.076] + - [440, 688.076] - - [2368, 5888, 1, 128] - - [479, 5273.86] + - [483, 5273.86] - - [64, 2944, 1, 128] - - [423, 1316.44] + - [427, 1316.44] - - [3584, 5888, 1, 256] - - [493, 9239.04] + - [497, 9239.04] - - [2368, 704, 1, 128] - - [482, 3537.55] + - [486, 3537.55] - - [3584, 2944, 1, 1280] - - [487, 9324.52] + - [491, 9324.52] - - [3584, 2368, 1, 128] - - [479, 4766.24] + - [483, 4766.24] - - [5056, 704, 1, 128] - - [479, 4487.85] + - [483, 4487.85] - - [448, 2368, 1, 128] - - [482, 2876.92] + - [486, 2876.92] - - [5056, 1408, 1, 3328] - - [498, 9515.87] + - [502, 9515.87] - - [1408, 704, 1, 256] - - [490, 6836.08] + - [494, 6836.08] - - [6784, 1024, 1, 3328] - - [485, 9309.55] + - [489, 9309.55] - - [6784, 2944, 1, 3328] - - [486, 9536.48] + - [490, 9536.48] - - [2944, 5056, 1, 3328] - - [487, 9526.15] + - [491, 9526.15] - - [1856, 1856, 1, 256] - - [487, 5239.14] + - [491, 5239.14] - - [1024, 5888, 1, 128] - - [479, 4006.18] + - [483, 4006.18] - - [2048, 7133, 1, 2048] - - [485, 9827.97] + - [489, 9827.97] - - [256, 4, 1, 128] - - [500, 4.28908] + - [504, 4.28908] - - [4288, 5888, 1, 1280] - - [495, 9202.73] + - [499, 9202.73] - - [4288, 4288, 1, 256] - - [490, 5521.08] + - [494, 5521.08] - - [448, 2944, 1, 3328] - - [493, 7724.43] + - [497, 7724.43] - - [4288, 1856, 1, 1280] - - [493, 8826.24] + - [497, 8826.24] - - [1856, 2944, 1, 3328] - - [487, 9194.8] + - [491, 9194.8] - - [256, 6784, 1, 3328] - - [487, 8740.23] + - [491, 8740.23] - - [64, 5888, 1, 256] - - [487, 4766.25] + - [491, 4766.25] - - [256, 5056, 1, 128] - - [479, 2937.5] + - [483, 2937.5] - - [5056, 1024, 1, 256] - - [498, 5467.81] + - [502, 5467.81] - - [704, 64, 1, 3328] - - [475, 4818.33] + - [479, 4818.33] - - [5056, 1856, 1, 3328] - - [492, 8861.59] + - [496, 8861.59] - - [4, 2944, 1, 3328] - - [442, 662.002] + - [446, 662.002] - - [4, 5056, 1, 256] - - [502, 494.021] + - [506, 494.021] - - [1856, 1408, 1, 256] - - [487, 8674.68] + - [491, 8674.68] - - [3584, 4, 1, 128] - - [499, 108.196] + - [503, 108.196] - - [448, 448, 1, 3328] - - [461, 6457.3] + - [465, 6457.3] - - [6784, 128, 1, 3328] - - [454, 7256.61] + - [458, 7256.61] - - [4288, 1408, 1, 128] - - [482, 4791.66] + - [486, 4791.66] - - [4288, 5056, 1, 256] - - [487, 8560.74] + - [491, 8560.74] - - [1408, 128, 1, 1280] - - [469, 5085.69] + - [473, 5085.69] - - [5056, 256, 1, 3328] - - [490, 7284.13] + - [494, 7284.13] - - [704, 704, 1, 256] - - [487, 6171.09] + - [491, 6171.09] - - [1024, 5888, 1, 1280] - - [492, 8852.79] + - [496, 8852.79] - - [6784, 2368, 1, 128] - - [480, 4729.2] + - [484, 4729.2] - - [4, 5056, 1, 1280] - - [453, 669.946] + - [457, 669.946] - - [64, 128, 1, 256] - - [455, 369.217] + - [459, 369.217] - - [128, 1856, 1, 1280] - - [449, 5549.03] + - [453, 5549.03] - - [5056, 3584, 1, 256] - - [493, 7115.74] + - [497, 7115.74] - - [1856, 1024, 1, 1280] - - [485, 8196.4] + - [489, 8196.4] - - [6784, 4288, 1, 1280] - - [486, 9509.56] + - [490, 9509.56] - - [1856, 1856, 1, 1280] - - [488, 5791.89] + - [492, 5791.89] - - [6784, 2944, 1, 128] - - [479, 5317.02] + - [483, 5317.02] - - [1408, 5056, 1, 1280] - - [488, 8980.63] + - [492, 8980.63] - - [4, 2368, 1, 3328] - - [453, 592.534] + - [457, 592.534] - - [5888, 1856, 1, 128] - - [478, 4600.1] + - [482, 4600.1] - - [448, 704, 1, 1280] - - [487, 2286.48] + - [491, 2286.48] - - [2368, 1024, 1, 128] - - [482, 3911.02] + - [486, 3911.02] - - [1024, 448, 1, 3328] - - [487, 7295.14] + - [491, 7295.14] - - [1856, 704, 1, 1280] - - [487, 8881.02] + - [491, 8881.02] - - [5056, 3584, 1, 128] - - [479, 4911.58] + - [483, 4911.58] - - [5888, 5888, 1, 3328] - - [495, 9243.8] + - [499, 9243.8] - - [6784, 1024, 1, 256] - - [498, 5475.31] + - [502, 5475.31] - - [2944, 2368, 1, 256] - - [493, 5670.67] + - [497, 5670.67] - - [256, 448, 1, 256] - - [444, 2293.76] + - [448, 2293.76] - - [5056, 5888, 1, 3328] - - [488, 7847.97] + - [492, 7847.97] - - [1856, 1024, 1, 256] - - [493, 7517.6] + - [497, 7517.6] - - [448, 1408, 1, 1280] - - [487, 6917.44] + - [491, 6917.44] - - [3584, 448, 1, 1280] - - [493, 7980.76] + - [497, 7980.76] - - [1024, 1024, 1, 1280] - - [490, 8384.42] + - [494, 8384.42] - - [448, 5888, 1, 256] - - [487, 7365.65] + - [491, 7365.65] - - [704, 64, 1, 128] - - [432, 358.655] + - [436, 358.655] - - [1408, 6784, 1, 3328] - - [493, 9094.09] + - [497, 9094.09] - - [448, 1024, 1, 128] - - [482, 1772.95] + - [486, 1772.95] - - [4288, 704, 1, 128] - - [479, 4355.28] + - [483, 4355.28] - - [128, 1856, 1, 128] - - [418, 1610.63] + - [422, 1610.63] - - [448, 2368, 1, 3328] - - [493, 7366.37] + - [497, 7366.37] - - [5056, 64, 1, 128] - - [418, 2157.23] + - [422, 2157.23] - - [5056, 2944, 1, 256] - - [487, 9123.06] + - [491, 9123.06] - - [6784, 5888, 1, 128] - - [478, 5285.8] + - [482, 5285.8] - - [704, 1024, 1, 256] - - [493, 6667.25] + - [497, 6667.25] - - [1024, 4, 1, 256] - - [442, 187.246] + - [446, 187.246] - - [2368, 1856, 1, 256] - - [493, 6777.84] + - [497, 6777.84] - - [128, 6784, 1, 1280] - - [490, 7052.61] + - [494, 7052.61] - - [1408, 3584, 1, 3328] - - [494, 9037.95] + - [498, 9037.95] - - [2368, 6784, 1, 256] - - [487, 9181.35] + - [491, 9181.35] - - [5056, 1408, 1, 1280] - - [492, 9421.9] + - [496, 9421.9] - - [256, 256, 1, 128] - - [424, 543.304] + - [428, 543.304] - - [5056, 4288, 1, 128] - - [482, 5339.92] + - [486, 5339.92] - - [1408, 1856, 1, 128] - - [479, 4270.89] + - [483, 4270.89] - - [1408, 5888, 1, 3328] - - [491, 9034.79] + - [495, 9034.79] - - [1856, 256, 1, 256] - - [493, 5847.83] + - [497, 5847.83] - - [6784, 6784, 1, 256] - - [486, 9624.38] + - [490, 9624.38] - - [64, 256, 1, 128] - - [425, 146.449] + - [429, 146.449] - - [4288, 2368, 1, 128] - - [478, 3896.94] + - [482, 3896.94] - - [1856, 4288, 1, 128] - - [479, 4337.07] + - [483, 4337.07] - - [256, 4288, 1, 1280] - - [487, 7499.42] + - [491, 7499.42] - - [2368, 2944, 1, 256] - - [492, 7703.18] + - [496, 7703.18] - - [4, 1856, 1, 256] - - [505, 263.964] + - [509, 263.964] - - [3584, 1856, 1, 1280] - - [487, 9224.33] + - [491, 9224.33] - - [6784, 6784, 1, 128] - - [479, 5476.03] + - [483, 5476.03] - - [256, 1856, 1, 128] - - [482, 1858.72] + - [486, 1858.72] - - [704, 64, 1, 1280] - - [460, 3368.67] + - [464, 3368.67] - - [5888, 5056, 1, 256] - - [493, 5859.81] + - [497, 5859.81] - - [3584, 448, 1, 256] - - [493, 7298.33] + - [497, 7298.33] - - [448, 4288, 1, 128] - - [479, 3813.45] + - [483, 3813.45] - - [2944, 4288, 1, 3328] - - [488, 9149.63] + - [492, 9149.63] - - [256, 6784, 1, 256] - - [487, 7984.85] + - [491, 7984.85] - - [1408, 4288, 1, 128] - - [482, 4728.34] + - [486, 4728.34] - - [2944, 704, 1, 3328] - - [493, 7149.76] + - [497, 7149.76] - - [128, 448, 1, 256] - - [459, 1699.08] + - [463, 1699.08] - - [512, 32, 1, 512] - - [459, 1127.5] + - [463, 1127.5] - - [3584, 3584, 1, 256] - - [488, 8558.01] + - [492, 8558.01] - - [448, 1408, 1, 128] - - [479, 2504.35] + - [483, 2504.35] - - [128, 256, 1, 1280] - - [460, 3216.49] + - [464, 3216.49] - - [3584, 5056, 1, 256] - - [485, 5674.35] + - [489, 5674.35] - - [6784, 128, 1, 256] - - [487, 6216.39] + - [491, 6216.39] - - [4288, 4, 1, 256] - - [503, 435.606] + - [507, 435.606] - - [64, 1408, 1, 3328] - - [461, 6185.91] + - [465, 6185.91] - - [704, 448, 1, 256] - - [493, 4004.98] + - [497, 4004.98] - - [2944, 2368, 1, 1280] - - [494, 8542.7] + - [498, 8542.7] - - [448, 64, 1, 3328] - - [474, 3835.23] + - [478, 3835.23] - - [1408, 3584, 1, 256] - - [487, 8714.53] + - [491, 8714.53] - - [3584, 4, 1, 3328] - - [442, 689.454] + - [446, 689.454] - - [6784, 3584, 1, 256] - - [492, 9271.24] + - [496, 9271.24] - - [256, 128, 1, 128] - - [425, 283.399] + - [429, 283.399] - - [704, 1408, 1, 128] - - [479, 3210.47] + - [483, 3210.47] - - [4, 2368, 1, 256] - - [505, 360.838] + - [509, 360.838] - - [2944, 448, 1, 128] - - [479, 3344.31] + - [483, 3344.31] - - [128, 1408, 1, 256] - - [487, 3186.28] + - [491, 3186.28] - - [4, 2944, 1, 256] - - [503, 384.522] + - [507, 384.522] - - [64, 128, 1, 3328] - - [456, 2103.62] + - [460, 2103.62] - - [5056, 2368, 1, 128] - - [479, 5219.66] + - [483, 5219.66] - - [2944, 2944, 1, 3328] - - [496, 9174.59] + - [500, 9174.59] - - [5056, 6784, 1, 256] - - [498, 8992.26] + - [502, 8992.26] - - [1856, 3584, 1, 128] - - [479, 4957.17] + - [483, 4957.17] - - [128, 2944, 1, 128] - - [417, 2241.38] + - [421, 2241.38] - - [1024, 704, 1, 3328] - - [497, 6545.01] + - [501, 6545.01] - - [6784, 448, 1, 256] - - [493, 5379.15] + - [497, 5379.15] - - [3584, 6784, 1, 128] - - [479, 5101.91] + - [483, 5101.91] - - [128, 4288, 1, 256] - - [487, 5211.76] + - [491, 5211.76] - - [704, 448, 1, 3328] - - [488, 4504.05] + - [492, 4504.05] - - [1024, 1024, 1, 3328] - - [490, 8009.67] + - [494, 8009.67] - - [128, 128, 1, 3328] - - [473, 3184.93] + - [477, 3184.93] - - [5056, 1856, 1, 256] - - [487, 9138.33] + - [491, 9138.33] - - [256, 128, 1, 256] - - [459, 1205.26] + - [463, 1205.26] - - [1024, 1856, 1, 256] - - [498, 6374.99] + - [502, 6374.99] - - [4288, 64, 1, 128] - - [415, 1695.33] + - [419, 1695.33] - - [256, 448, 1, 3328] - - [462, 5659.57] + - [466, 5659.57] - - [1408, 6784, 1, 1280] - - [487, 9349.1] + - [491, 9349.1] - - [3584, 3584, 1, 1280] - - [492, 9302.09] + - [496, 9302.09] - - [64, 2368, 1, 1280] - - [461, 4432.97] + - [465, 4432.97] - - [448, 2368, 1, 1280] - - [487, 7250.67] + - [491, 7250.67] - - [5888, 5888, 1, 128] - - [479, 4615.93] + - [483, 4615.93] - - [64, 6784, 1, 3328] - - [493, 6987.13] + - [497, 6987.13] - - [2944, 256, 1, 1280] - - [496, 6127.35] + - [500, 6127.35] - - [5056, 5888, 1, 128] - - [478, 5106.29] + - [482, 5106.29] - - [256, 2368, 1, 128] - - [479, 2141.13] + - [483, 2141.13] - - [5056, 2368, 1, 3328] - - [490, 9041.65] + - [494, 9041.65] - - [2944, 4288, 1, 256] - - [498, 8691.12] + - [502, 8691.12] - - [1408, 3584, 1, 1280] - - [487, 9069.9] + - [491, 9069.9] - - [2368, 64, 1, 256] - - [459, 2412.77] + - [463, 2412.77] - - [64, 448, 1, 3328] - - [474, 3739.04] + - [478, 3739.04] - - [256, 256, 1, 3328] - - [461, 5304.08] + - [465, 5304.08] - - [5888, 4, 1, 128] - - [500, 105.555] + - [504, 105.555] - - [1856, 704, 1, 256] - - [487, 8025.33] + - [491, 8025.33] - - [4, 4288, 1, 1280] - - [434, 578.97] + - [438, 578.97] - - [1408, 448, 1, 3328] - - [495, 5714.41] + - [499, 5714.41] - - [1024, 4, 1, 3328] - - [453, 608.549] + - [457, 608.549] - - [2368, 256, 1, 256] - - [493, 5172.98] + - [497, 5172.98] - - [2368, 6784, 1, 3328] - - [493, 9456.51] + - [497, 9456.51] - - [1856, 1408, 1, 1280] - - [498, 7805.09] + - [502, 7805.09] - - [1856, 448, 1, 1280] - - [485, 6184.94] + - [489, 6184.94] - - [6784, 704, 1, 128] - - [479, 4597.77] + - [483, 4597.77] - - [4, 4, 1, 256] - - [456, 0.691892] + - [460, 0.691892] - - [128, 5888, 1, 128] - - [417, 2691.66] + - [421, 2691.66] - - [1408, 5888, 1, 256] - - [492, 7164.17] + - [496, 7164.17] - - [704, 2944, 1, 1280] - - [494, 8139.71] + - [498, 8139.71] - - [1856, 2368, 1, 128] - - [482, 4623.28] + - [486, 4623.28] - - [4096, 7133, 1, 4096] - - [486, 9939.97] + - [490, 9939.97] - - [256, 64, 1, 256] - - [450, 689.853] + - [454, 689.853] - - [1024, 1024, 1, 256] - - [493, 7216.01] + - [497, 7216.01] - - [704, 1856, 1, 256] - - [493, 6364.07] + - [497, 6364.07] - - [128, 4288, 1, 3328] - - [449, 7200.49] + - [453, 7200.49] - - [3584, 704, 1, 1280] - - [496, 7971.98] + - [500, 7971.98] - - [256, 128, 1, 1280] - - [447, 2702.52] + - [451, 2702.52] - - [2368, 4, 1, 256] - - [442, 325.918] + - [446, 325.918] - - [256, 2368, 1, 1280] - - [487, 6638.83] + - [491, 6638.83] - - [2944, 6784, 1, 128] - - [478, 5233.43] + - [482, 5233.43] - - [3584, 448, 1, 3328] - - [487, 8094.3] + - [491, 8094.3] - - [1408, 4, 1, 256] - - [505, 243.546] + - [509, 243.546] - - [704, 2368, 1, 3328] - - [487, 8403.01] + - [491, 8403.01] - - [2944, 448, 1, 256] - - [487, 7022.49] + - [491, 7022.49] - - [1856, 448, 1, 128] - - [482, 2842.69] + - [486, 2842.69] - - [2368, 128, 1, 1280] - - [469, 5685.42] + - [473, 5685.42] - - [256, 5888, 1, 128] - - [484, 2178.61] + - [488, 2178.61] - - [64, 6784, 1, 256] - - [487, 5385.13] + - [491, 5385.13] - - [64, 5056, 1, 1280] - - [461, 5603.19] + - [465, 5603.19] - - [4, 6784, 1, 128] - - [499, 180.156] + - [503, 180.156] - - [2944, 2944, 1, 1280] - - [496, 9129.29] + - [500, 9129.29] - - [5888, 2368, 1, 256] - - [498, 6961.59] + - [502, 6961.59] - - [4, 3584, 1, 1280] - - [442, 646.13] + - [446, 646.13] - - [1408, 128, 1, 128] - - [428, 1172.19] + - [432, 1172.19] - - [6784, 704, 1, 3328] - - [493, 9084.52] + - [497, 9084.52] - - [128, 64, 1, 1280] - - [472, 1260.31] + - [476, 1260.31] - - [2368, 256, 1, 1280] - - [493, 6643.38] + - [497, 6643.38] - - [4, 448, 1, 3328] - - [456, 433.414] + - [460, 433.414] - - [5888, 4288, 1, 128] - - [480, 4753.07] + - [484, 4753.07] - - [4, 5888, 1, 256] - - [442, 471.04] + - [446, 471.04] - - [1408, 2944, 1, 3328] - - [496, 9207.0] + - [500, 9207.0] - - [3584, 704, 1, 128] - - [482, 3762.36] + - [486, 3762.36] - - [64, 1024, 1, 256] - - [460, 1807.89] + - [464, 1807.89] - - [5056, 5056, 1, 128] - - [483, 4830.06] + - [487, 4830.06] - - [2368, 448, 1, 1280] - - [487, 7263.06] + - [491, 7263.06] - - [128, 3584, 1, 256] - - [490, 4369.07] + - [494, 4369.07] - - [704, 448, 1, 1280] - - [488, 4205.23] + - [492, 4205.23] - - [448, 5056, 1, 128] - - [479, 3855.47] + - [483, 3855.47] - - [256, 4, 1, 1280] - - [510, 157.538] + - [514, 157.538] - - [128, 5056, 1, 256] - - [493, 6108.96] + - [497, 6108.96] - - [1408, 5056, 1, 128] - - [482, 4836.58] + - [486, 4836.58] - - [2944, 3584, 1, 128] - - [482, 4532.09] + - [486, 4532.09] - - [3584, 2368, 1, 256] - - [487, 8951.24] + - [491, 8951.24] - - [5888, 5056, 1, 1280] - - [498, 9276.39] + - [502, 9276.39] - - [2368, 5056, 1, 128] - - [482, 5167.56] + - [486, 5167.56] - - [64, 704, 1, 256] - - [442, 1501.87] + - [446, 1501.87] - - [4288, 256, 1, 1280] - - [487, 7496.2] + - [491, 7496.2] - - [3584, 3584, 1, 3328] - - [488, 9301.67] + - [492, 9301.67] - - [1024, 256, 1, 128] - - [479, 1508.74] + - [483, 1508.74] - - [4, 704, 1, 128] - - [500, 12.0469] + - [504, 12.0469] - - [5888, 6784, 1, 256] - - [486, 9370.37] + - [490, 9370.37] - - [4288, 2944, 1, 3328] - - [490, 9148.99] + - [494, 9148.99] - - [2944, 64, 1, 128] - - [426, 1456.36] + - [430, 1456.36] - - [1856, 64, 1, 256] - - [452, 2209.93] + - [456, 2209.93] - - [4288, 128, 1, 3328] - - [446, 6471.85] + - [450, 6471.85] - - [4288, 704, 1, 1280] - - [493, 8934.51] + - [497, 8934.51] - - [256, 5056, 1, 1280] - - [487, 8439.03] + - [491, 8439.03] - - [1408, 256, 1, 128] - - [482, 1769.07] + - [486, 1769.07] - - [2944, 5888, 1, 3328] - - [487, 9447.94] + - [491, 9447.94] - - [6784, 5888, 1, 1280] - - [498, 9372.15] + - [502, 9372.15] - - [704, 128, 1, 256] - - [444, 2059.7] + - [448, 2059.7] - - [5888, 4288, 1, 1280] - - [490, 9244.22] + - [494, 9244.22] - - [448, 256, 1, 1280] - - [469, 4741.62] + - [473, 4741.62] - - [5888, 3584, 1, 128] - - [478, 4979.96] + - [482, 4979.96] - - [1856, 1856, 1, 128] - - [482, 4363.88] + - [486, 4363.88] - - [5056, 4, 1, 1280] - - [502, 629.541] + - [506, 629.541] - - [256, 1408, 1, 1280] - - [493, 5588.34] + - [497, 5588.34] - - [512, 16, 1, 512] - - [453, 689.853] + - [457, 689.853] - - [704, 3584, 1, 128] - - [482, 4069.57] + - [486, 4069.57] - - [5888, 448, 1, 3328] - - [498, 7925.84] + - [502, 7925.84] - - [2368, 4288, 1, 1280] - - [497, 8492.6] + - [501, 8492.6] - - [4288, 2944, 1, 128] - - [479, 5238.11] + - [483, 5238.11] - - [1024, 6784, 1, 3328] - - [493, 8578.08] + - [497, 8578.08] - - [128, 2368, 1, 256] - - [493, 3788.8] + - [497, 3788.8] - - [6784, 64, 1, 3328] - - [487, 7003.36] + - [491, 7003.36] - - [5056, 2944, 1, 3328] - - [490, 8575.35] + - [494, 8575.35] - - [448, 128, 1, 256] - - [442, 1714.96] + - [446, 1714.96] - - [2944, 3584, 1, 256] - - [487, 8994.16] + - [491, 8994.16] - - [1408, 1408, 1, 3328] - - [485, 8757.6] + - [489, 8757.6] - - [1856, 128, 1, 1280] - - [487, 5598.07] + - [491, 5598.07] - - [3584, 3584, 1, 128] - - [478, 4787.34] + - [482, 4787.34] - - [64, 3584, 1, 256] - - [493, 3545.91] + - [497, 3545.91] - - [1408, 4, 1, 3328] - - [437, 640.14] + - [441, 640.14] - - [128, 2944, 1, 3328] - - [461, 7204.14] + - [465, 7204.14] - - [3584, 704, 1, 256] - - [487, 6239.59] + - [491, 6239.59] - - [2944, 448, 1, 3328] - - [493, 7726.61] + - [497, 7726.61] - - [3584, 1408, 1, 3328] - - [485, 9358.68] + - [489, 9358.68] - - [704, 3584, 1, 1280] - - [493, 8005.18] + - [497, 8005.18] - - [2944, 6784, 1, 1280] - - [485, 9487.63] + - [489, 9487.63] - - [1856, 6784, 1, 256] - - [487, 5684.46] + - [491, 5684.46] - - [4288, 448, 1, 3328] - - [493, 8410.28] + - [497, 8410.28] - - [6784, 4288, 1, 128] - - [483, 4785.48] + - [487, 4785.48] - - [6784, 704, 1, 1280] - - [487, 5578.95] + - [491, 5578.95] - - [256, 4288, 1, 256] - - [487, 6781.33] + - [491, 6781.33] - - [3584, 64, 1, 128] - - [426, 1473.9] + - [430, 1473.9] - - [5888, 1024, 1, 3328] - - [485, 8639.39] + - [489, 8639.39] - - [448, 64, 1, 128] - - [417, 259.182] + - [421, 259.182] - - [704, 6784, 1, 1280] - - [493, 9027.15] + - [497, 9027.15] - - [5888, 128, 1, 256] - - [493, 6812.78] + - [497, 6812.78] - - [2368, 448, 1, 3328] - - [493, 7356.53] + - [497, 7356.53] - - [1856, 5056, 1, 3328] - - [492, 8871.46] + - [496, 8871.46] - - [4, 6784, 1, 256] - - [501, 469.379] + - [505, 469.379] - - [1024, 3584, 1, 128] - - [479, 3427.92] + - [483, 3427.92] - - [1024, 1408, 1, 128] - - [482, 2934.95] + - [486, 2934.95] - - [2368, 2944, 1, 128] - - [482, 4887.92] + - [486, 4887.92] - - [5056, 64, 1, 256] - - [451, 3186.06] + - [455, 3186.06] - - [4, 448, 1, 1280] - - [456, 273.067] + - [460, 273.067] - - [5056, 2944, 1, 128] - - [483, 4752.69] + - [487, 4752.69] - - [5888, 5056, 1, 3328] - - [497, 9124.67] + - [501, 9124.67] - - [1024, 704, 1, 128] - - [482, 2302.26] + - [486, 2302.26] - - [1408, 2368, 1, 128] - - [482, 3826.85] + - [486, 3826.85] - - [5888, 2368, 1, 128] - - [479, 4912.67] + - [483, 4912.67] - - [128, 5056, 1, 3328] - - [469, 7583.7] + - [473, 7583.7] - - [3584, 6784, 1, 1280] - - [496, 9313.4] + - [500, 9313.4] - - [3072, 7435, 1, 1024] - - [490, 9321.97] + - [494, 9321.97] - - [1856, 5888, 1, 256] - - [487, 5778.24] + - [491, 5778.24] - - [256, 256, 1, 256] - - [439, 1576.81] + - [443, 1576.81] - - [256, 64, 1, 128] - - [425, 173.605] + - [429, 173.605] - - [4288, 4288, 1, 3328] - - [492, 8416.17] + - [496, 8416.17] - - [4288, 1408, 1, 1280] - - [498, 9301.87] + - [502, 9301.87] - - [3584, 5056, 1, 128] - - [484, 4344.84] + - [488, 4344.84] - - [4, 1024, 1, 3328] - - [453, 615.139] + - [457, 615.139] - - [4288, 2368, 1, 256] - - [487, 9142.57] + - [491, 9142.57] - - [2944, 5056, 1, 1280] - - [487, 9399.59] + - [491, 9399.59] - - [448, 6784, 1, 256] - - [486, 5710.83] + - [490, 5710.83] - - [64, 1024, 1, 3328] - - [469, 4975.0] + - [473, 4975.0] - - [6784, 2368, 1, 3328] - - [496, 9207.53] + - [500, 9207.53] - - [256, 1024, 1, 1280] - - [493, 5983.32] + - [497, 5983.32] - - [704, 4, 1, 128] - - [499, 15.0187] + - [503, 15.0187] - - [256, 4, 1, 256] - - [456, 52.8516] + - [460, 52.8516] - - [4288, 128, 1, 256] - - [487, 5242.88] + - [491, 5242.88] - - [4288, 1856, 1, 3328] - - [498, 9353.96] + - [502, 9353.96] - - [3584, 448, 1, 128] - - [479, 3353.8] + - [483, 3353.8] - - [256, 4, 1, 3328] - - [510, 313.224] + - [514, 313.224] - - [4, 1408, 1, 1280] - - [453, 509.107] + - [457, 509.107] - - [3584, 64, 1, 1280] - - [441, 5198.32] + - [445, 5198.32] - - [1408, 448, 1, 128] - - [479, 2628.27] + - [483, 2628.27] - - [3584, 1024, 1, 1280] - - [493, 8534.91] + - [497, 8534.91] - - [1856, 5056, 1, 256] - - [485, 8184.39] + - [489, 8184.39] - - [4, 3584, 1, 256] - - [503, 395.476] + - [507, 395.476] - - [1024, 4288, 1, 256] - - [488, 5966.42] + - [492, 5966.42] - - [5888, 3584, 1, 3328] - - [491, 9189.33] + - [495, 9189.33] - - [4, 256, 1, 256] - - [507, 41.4785] + - [511, 41.4785] - - [5056, 3584, 1, 3328] - - [492, 9431.82] + - [496, 9431.82] - - [128, 5888, 1, 1280] - - [487, 8192.0] + - [491, 8192.0] - - [704, 448, 1, 128] - - [479, 1510.86] + - [483, 1510.86] - - [2368, 1408, 1, 1280] - - [487, 8415.55] + - [491, 8415.55] - - [5056, 2944, 1, 1280] - - [498, 9294.67] + - [502, 9294.67] - - [4, 4, 1, 128] - - [500, 0.0356549] + - [504, 0.0356549] - - [3584, 256, 1, 256] - - [487, 6749.45] + - [491, 6749.45] - - [128, 1856, 1, 3328] - - [440, 6796.99] + - [444, 6796.99] - - [1024, 6784, 1, 256] - - [493, 8782.99] + - [497, 8782.99] - - [4, 128, 1, 256] - - [453, 27.3067] + - [457, 27.3067] - - [64, 64, 1, 1280] - - [472, 712.348] + - [476, 712.348] - - [6784, 4, 1, 128] - - [500, 121.96] + - [504, 121.96] - - [2944, 1408, 1, 128] - - [482, 4430.36] + - [486, 4430.36] - - [448, 128, 1, 3328] - - [469, 5097.24] + - [473, 5097.24] - - [64, 2944, 1, 3328] - - [469, 6362.1] + - [473, 6362.1] - - [64, 4288, 1, 3328] - - [469, 6564.91] + - [473, 6564.91] - - [5056, 6784, 1, 3328] - - [493, 8121.08] + - [497, 8121.08] - - [128, 2944, 1, 256] - - [487, 4692.07] + - [491, 4692.07] - - [128, 6784, 1, 128] - - [416, 2687.36] + - [420, 2687.36] - - [3584, 4288, 1, 256] - - [493, 9193.89] + - [497, 9193.89] - - [448, 1856, 1, 256] - - [493, 6231.29] + - [497, 6231.29] - - [1856, 6784, 1, 3328] - - [498, 9191.38] + - [502, 9191.38] - - [3584, 128, 1, 3328] - - [487, 7368.37] + - [491, 7368.37] - - [64, 1856, 1, 256] - - [438, 2184.53] + - [442, 2184.53] - - [1024, 448, 1, 1280] - - [493, 6977.22] + - [497, 6977.22] - - [5888, 4288, 1, 256] - - [493, 5780.4] + - [497, 5780.4] - - [4, 448, 1, 128] - - [500, 8.96] + - [504, 8.96] - - [5056, 1408, 1, 256] - - [487, 5601.25] + - [491, 5601.25] - - [64, 256, 1, 1280] - - [453, 1927.53] + - [457, 1927.53] - - [3584, 1024, 1, 256] - - [498, 7542.74] + - [502, 7542.74] - - [256, 704, 1, 256] - - [487, 2957.52] + - [491, 2957.52] - - [5888, 5888, 1, 256] - - [498, 7344.04] + - [502, 7344.04] - - [4288, 1024, 1, 1280] - - [493, 8925.74] + - [497, 8925.74] - - [5888, 128, 1, 3328] - - [487, 8409.97] + - [491, 8409.97] - - [448, 6784, 1, 3328] - - [487, 8862.46] + - [491, 8862.46] - - [2944, 1408, 1, 1280] - - [498, 7478.83] + - [502, 7478.83] - - [1024, 32, 1, 512] - - [442, 1777.25] + - [446, 1777.25] - - [2944, 1856, 1, 3328] - - [487, 9153.33] + - [491, 9153.33] - - [2368, 64, 1, 128] - - [426, 1102.2] + - [430, 1102.2] - - [2944, 2944, 1, 128] - - [478, 4591.85] + - [482, 4591.85] - - [4, 128, 1, 3328] - - [508, 118.99] + - [512, 118.99] - - [3584, 5888, 1, 1280] - - [487, 9222.39] + - [491, 9222.39] - - [64, 4, 1, 128] - - [499, 0.93516] + - [503, 0.93516] - - [6784, 1856, 1, 1280] - - [487, 9135.97] + - [491, 9135.97] - - [2944, 5056, 1, 256] - - [493, 8860.03] + - [497, 8860.03] - - [2944, 5888, 1, 1280] - - [486, 9643.53] + - [490, 9643.53] - - [5888, 256, 1, 3328] - - [493, 8799.43] + - [497, 8799.43] - - [1856, 5888, 1, 3328] - - [493, 9457.43] + - [497, 9457.43] - - [3584, 1408, 1, 256] - - [493, 8672.43] + - [497, 8672.43] - - [704, 3584, 1, 3328] - - [493, 8525.2] + - [497, 8525.2] - - [5056, 448, 1, 1280] - - [493, 8843.67] + - [497, 8843.67] - - [3584, 1856, 1, 3328] - - [485, 8881.43] + - [489, 8881.43] - - [64, 1408, 1, 128] - - [414, 747.042] + - [418, 747.042] - - [1408, 704, 1, 1280] - - [487, 8342.83] + - [491, 8342.83] - - [2944, 1024, 1, 256] - - [498, 8079.48] + - [502, 8079.48] - - [1024, 2368, 1, 128] - - [482, 3347.48] + - [486, 3347.48] - - [2368, 4288, 1, 3328] - - [493, 9467.57] + - [497, 9467.57] - - [4, 1408, 1, 256] - - [505, 257.463] + - [509, 257.463] - - [1024, 1408, 1, 1280] - - [493, 8241.74] + - [497, 8241.74] - - [64, 64, 1, 256] - - [453, 189.959] + - [457, 189.959] - - [704, 256, 1, 3328] - - [487, 4519.18] + - [491, 4519.18] - - [6784, 5056, 1, 256] - - [486, 9133.68] + - [490, 9133.68] - - [4, 4288, 1, 3328] - - [437, 669.975] + - [441, 669.975] - - [448, 6784, 1, 128] - - [479, 4481.82] + - [483, 4481.82] - - [4, 704, 1, 3328] - - [509, 522.971] + - [513, 522.971] - - [448, 2944, 1, 256] - - [487, 7022.49] + - [491, 7022.49] - - [2944, 6784, 1, 256] - - [493, 9199.74] + - [497, 9199.74] - - [2368, 2368, 1, 1280] - - [498, 8646.74] + - [502, 8646.74] - - [4, 4, 1, 1280] - - [456, 3.01176] + - [460, 3.01176] - - [1856, 3584, 1, 1280] - - [485, 8805.35] + - [489, 8805.35] - - [64, 2944, 1, 256] - - [459, 2565.66] + - [463, 2565.66] - - [3584, 1408, 1, 1280] - - [498, 9273.02] + - [502, 9273.02] - - [448, 256, 1, 128] - - [414, 941.03] + - [418, 941.03] - - [4288, 448, 1, 128] - - [480, 3215.1] + - [484, 3215.1] - - [5056, 256, 1, 1280] - - [493, 8790.03] + - [497, 8790.03] - - [1856, 1408, 1, 3328] - - [487, 9310.63] + - [491, 9310.63] - - [128, 128, 1, 128] - - [422, 155.115] + - [426, 155.115] - - [1024, 4288, 1, 3328] - - [490, 8528.02] + - [494, 8528.02] - - [448, 2368, 1, 256] - - [494, 5097.24] + - [498, 5097.24] - - [1024, 4, 1, 128] - - [500, 10.2721] + - [504, 10.2721] - - [5056, 448, 1, 256] - - [493, 8236.68] + - [497, 8236.68] - - [2944, 2368, 1, 3328] - - [486, 9331.06] + - [490, 9331.06] - - [704, 128, 1, 3328] - - [461, 5969.2] + - [465, 5969.2] - - [64, 64, 1, 3328] - - [477, 1494.68] + - [481, 1494.68] - - [1024, 1856, 1, 1280] - - [492, 6356.33] + - [496, 6356.33] - - [6784, 1856, 1, 256] - - [493, 9068.53] + - [497, 9068.53] - - [128, 2368, 1, 3328] - - [469, 6714.12] + - [473, 6714.12] - - [1024, 5888, 1, 256] - - [493, 5501.5] + - [497, 5501.5] - - [5056, 128, 1, 1280] - - [449, 6455.54] + - [453, 6455.54] - - [5056, 64, 1, 3328] - - [454, 6703.71] + - [458, 6703.71] - - [128, 704, 1, 128] - - [415, 696.518] + - [419, 696.518] - - [1408, 2368, 1, 256] - - [487, 8667.15] + - [491, 8667.15] - - [1408, 1408, 1, 256] - - [498, 7615.71] + - [502, 7615.71] - - [4, 64, 1, 128] - - [500, 0.98463] + - [504, 0.98463] - - [64, 128, 1, 1280] - - [472, 1379.71] + - [476, 1379.71] - - [2368, 2368, 1, 128] - - [482, 4582.16] + - [486, 4582.16] - - [64, 5888, 1, 128] - - [415, 2086.27] + - [419, 2086.27] - - [5888, 4, 1, 3328] - - [436, 667.414] + - [440, 667.414] - - [6784, 1408, 1, 128] - - [483, 4516.24] + - [487, 4516.24] - - [4288, 5888, 1, 256] - - [498, 8497.33] + - [502, 8497.33] - - [1408, 5056, 1, 256] - - [487, 8867.36] + - [491, 8867.36] - - [5056, 128, 1, 3328] - - [469, 7678.88] + - [473, 7678.88] - - [128, 128, 1, 1280] - - [457, 2016.49] + - [461, 2016.49] - - [448, 704, 1, 256] - - [488, 3030.79] + - [492, 3030.79] - - [4288, 3584, 1, 128] - - [479, 5246.23] + - [483, 5246.23] - - [2944, 128, 1, 3328] - - [454, 6795.06] + - [458, 6795.06] - - [128, 5056, 1, 1280] - - [440, 6192.99] + - [444, 6192.99] - - [3584, 5056, 1, 1280] - - [492, 9499.07] + - [496, 9499.07] - - [256, 448, 1, 1280] - - [448, 4267.46] + - [452, 4267.46] - - [704, 704, 1, 128] - - [482, 2259.22] + - [486, 2259.22] - - [5056, 4, 1, 128] - - [500, 12.4313] + - [504, 12.4313] - - [704, 256, 1, 1280] - - [487, 4355.87] + - [491, 4355.87] - - [64, 2368, 1, 3328] - - [461, 6310.87] + - [465, 6310.87] - - [1856, 1024, 1, 128] - - [478, 4065.33] + - [482, 4065.33] - - [1856, 64, 1, 128] - - [417, 936.229] + - [421, 936.229] - - [64, 6784, 1, 1280] - - [440, 5731.7] + - [444, 5731.7] - - [704, 4288, 1, 256] - - [493, 5218.8] + - [497, 5218.8] - - [5888, 2368, 1, 1280] - - [487, 9378.8] + - [491, 9378.8] - - [128, 256, 1, 256] - - [457, 1219.27] + - [461, 1219.27] - - [256, 64, 1, 1280] - - [459, 1820.44] + - [463, 1820.44] - - [2368, 5888, 1, 1280] - - [498, 9143.54] + - [502, 9143.54] - - [5888, 256, 1, 1280] - - [487, 8678.37] + - [491, 8678.37] - - [4, 5888, 1, 1280] - - [434, 668.142] + - [438, 668.142] - - [704, 128, 1, 128] - - [422, 649.456] + - [426, 649.456] - - [1024, 4, 1, 1280] - - [453, 478.365] + - [457, 478.365] - - [2368, 1856, 1, 3328] - - [485, 8153.77] + - [489, 8153.77] - - [2368, 128, 1, 128] - - [420, 1858.11] + - [424, 1858.11] - - [2944, 704, 1, 256] - - [487, 8437.97] + - [491, 8437.97] - - [5056, 128, 1, 128] - - [416, 2689.53] + - [420, 2689.53] - - [256, 704, 1, 3328] - - [487, 4541.08] + - [491, 4541.08] - - [704, 3584, 1, 256] - - [488, 7770.97] + - [492, 7770.97] - - [1024, 1024, 1, 1024] - - [493, 8305.52] + - [497, 8305.52] - - [704, 2944, 1, 3328] - - [493, 9166.38] + - [497, 9166.38] - - [6784, 1024, 1, 128] - - [478, 4362.21] + - [482, 4362.21] - - [256, 448, 1, 128] - - [425, 899.514] + - [429, 899.514] - - [448, 1024, 1, 3328] - - [487, 7385.46] + - [491, 7385.46] - - [2944, 1024, 1, 3328] - - [490, 8779.71] + - [494, 8779.71] - - [2944, 5056, 1, 128] - - [482, 5103.01] + - [486, 5103.01] - - [1408, 6784, 1, 256] - - [493, 8346.79] + - [497, 8346.79] - - [6784, 1408, 1, 3328] - - [489, 8878.3] + - [493, 8878.3] - - [4288, 6784, 1, 128] - - [478, 5432.89] + - [482, 5432.89] - - [704, 64, 1, 256] - - [467, 1441.79] + - [471, 1441.79] - - [5888, 4, 1, 1280] - - [504, 636.541] + - [508, 636.541] - - [256, 2368, 1, 3328] - - [487, 6804.7] + - [491, 6804.7] - - [6784, 2944, 1, 1280] - - [486, 9472.16] + - [490, 9472.16] - - [4288, 1856, 1, 128] - - [482, 4886.28] + - [486, 4886.28] - - [1856, 2944, 1, 128] - - [479, 4642.86] + - [483, 4642.86] - - [6784, 448, 1, 128] - - [479, 4369.07] + - [483, 4369.07] - - [64, 3584, 1, 128] - - [426, 1645.75] + - [430, 1645.75] - - [448, 5056, 1, 1280] - - [487, 8553.54] + - [491, 8553.54] - - [2368, 1856, 1, 128] - - [479, 4741.75] + - [483, 4741.75] - - [128, 448, 1, 1280] - - [469, 3744.91] + - [473, 3744.91] - - [4288, 704, 1, 256] - - [487, 8444.06] + - [491, 8444.06] - - [256, 3584, 1, 128] - - [479, 2454.86] + - [483, 2454.86] - - [5888, 704, 1, 256] - - [487, 8819.47] + - [491, 8819.47] - - [3584, 1024, 1, 128] - - [482, 4094.86] + - [486, 4094.86] - - [256, 5888, 1, 3328] - - [496, 8538.23] + - [500, 8538.23] - - [1408, 4288, 1, 3328] - - [498, 9212.47] + - [502, 9212.47] - - [6784, 4288, 1, 256] - - [486, 9163.02] + - [490, 9163.02] - - [4288, 256, 1, 128] - - [479, 3081.34] + - [483, 3081.34] - - [5888, 256, 1, 256] - - [487, 7680.65] + - [491, 7680.65] - - [6784, 1024, 1, 1280] - - [498, 9248.53] + - [502, 9248.53] - - [5888, 1024, 1, 128] - - [482, 4061.84] + - [486, 4061.84] - - [1024, 128, 1, 256] - - [493, 2317.29] + - [497, 2317.29] - - [128, 64, 1, 3328] - - [476, 2116.69] + - [480, 2116.69] - - [448, 64, 1, 256] - - [459, 1079.42] + - [463, 1079.42] - - [2368, 256, 1, 128] - - [480, 2229.73] + - [484, 2229.73] - - [6784, 3584, 1, 1280] - - [493, 9096.5] + - [497, 9096.5] - - [1024, 6784, 1, 1280] - - [491, 9112.8] + - [495, 9112.8] - - [2944, 64, 1, 1280] - - [449, 4982.9] + - [453, 4982.9] - - [1408, 2944, 1, 1280] - - [488, 9131.53] + - [492, 9131.53] - - [256, 1856, 1, 256] - - [496, 4432.76] + - [500, 4432.76] - - [1408, 2368, 1, 3328] - - [496, 8449.08] + - [500, 8449.08] - - [2944, 4, 1, 3328] - - [442, 673.84] + - [446, 673.84] - - [128, 1408, 1, 3328] - - [461, 6582.37] + - [465, 6582.37] - - [2944, 1856, 1, 128] - - [479, 4827.44] + - [483, 4827.44] - - [256, 2944, 1, 128] - - [482, 2416.56] + - [486, 2416.56] - - [256, 6784, 1, 128] - - [482, 3118.66] + - [486, 3118.66] - - [2368, 4, 1, 128] - - [500, 22.6197] + - [504, 22.6197] - - [1408, 256, 1, 3328] - - [487, 3733.72] + - [491, 3733.72] - - [1856, 4, 1, 128] - - [499, 7.10009] + - [503, 7.10009] - - [1024, 16, 1, 512] - - [455, 1165.08] + - [459, 1165.08] - - [5056, 6784, 1, 128] - - [483, 4949.03] + - [487, 4949.03] - - [4288, 5056, 1, 128] - - [482, 4966.8] + - [486, 4966.8] - - [1856, 5888, 1, 128] - - [478, 4351.66] + - [482, 4351.66] - - [2944, 5888, 1, 256] - - [498, 8460.89] + - [502, 8460.89] - - [3584, 1856, 1, 256] - - [493, 8876.6] + - [497, 8876.6] - - [4288, 3584, 1, 1280] - - [486, 9603.6] + - [490, 9603.6] - - [2368, 448, 1, 256] - - [487, 6604.6] + - [491, 6604.6] - - [4288, 256, 1, 3328] - - [487, 7619.79] + - [491, 7619.79] - - [1856, 704, 1, 128] - - [479, 3629.51] + - [483, 3629.51] - - [1408, 64, 1, 256] - - [443, 2168.11] + - [447, 2168.11] - - [64, 1856, 1, 128] - - [419, 979.662] + - [423, 979.662] - - [4, 256, 1, 128] - - [500, 5.13595] + - [504, 5.13595] - - [704, 4288, 1, 3328] - - [493, 9014.42] + - [497, 9014.42] - - [704, 5888, 1, 128] - - [480, 4221.67] + - [484, 4221.67] - - [6784, 3584, 1, 128] - - [478, 5360.63] + - [482, 5360.63] - - [1024, 64, 1, 256] - - [438, 1588.75] + - [442, 1588.75] - - [64, 2368, 1, 256] - - [493, 2552.45] + - [497, 2552.45] - - [4288, 5056, 1, 3328] - - [492, 8193.28] + - [496, 8193.28] - - [4, 1856, 1, 1280] - - [442, 499.092] + - [446, 499.092] - - [4288, 128, 1, 128] - - [479, 2373.47] + - [483, 2373.47] - - [1408, 1408, 1, 128] - - [482, 3753.78] + - [486, 3753.78] - - [1024, 128, 1, 3328] - - [464, 5656.22] + - [468, 5656.22] - - [1856, 128, 1, 128] - - [415, 1617.48] + - [419, 1617.48] - - [5056, 2368, 1, 256] - - [498, 5553.31] + - [502, 5553.31] - - [4288, 704, 1, 3328] - - [486, 6961.96] + - [490, 6961.96] - - [448, 3584, 1, 256] - - [496, 5981.4] + - [500, 5981.4] - - [64, 128, 1, 128] - - [433, 74.8983] + - [437, 74.8983] - - [2368, 64, 1, 1280] - - [469, 5041.23] + - [473, 5041.23] - - [2368, 1024, 1, 1280] - - [494, 7740.87] + - [498, 7740.87] - - [2944, 1408, 1, 3328] - - [496, 9204.55] + - [500, 9204.55] - - [1408, 448, 1, 256] - - [493, 5954.3] + - [497, 5954.3] - - [1024, 1408, 1, 3328] - - [490, 8161.44] + - [494, 8161.44] - - [2560, 7133, 1, 2560] - - [485, 9636.59] + - [489, 9636.59] - - [1408, 4, 1, 1280] - - [437, 520.879] + - [441, 520.879] - - [5888, 3584, 1, 256] - - [498, 9225.16] + - [502, 9225.16] - - [128, 1024, 1, 1280] - - [440, 4755.45] + - [444, 4755.45] - - [1408, 1856, 1, 3328] - - [490, 9130.77] + - [494, 9130.77] - - [4, 4, 1, 3328] - - [510, 6.93333] + - [514, 6.93333] - - [6784, 1408, 1, 1280] - - [487, 9346.81] + - [491, 9346.81] - - [4, 1024, 1, 1280] - - [437, 422.813] + - [441, 422.813] - - [704, 2944, 1, 256] - - [493, 8331.96] + - [497, 8331.96] - - [704, 4288, 1, 128] - - [479, 4371.04] + - [483, 4371.04] - - [2368, 4288, 1, 128] - - [479, 3988.79] + - [483, 3988.79] - - [64, 4288, 1, 1280] - - [469, 5407.53] + - [473, 5407.53] - - [6784, 64, 1, 1280] - - [449, 5708.15] + - [453, 5708.15] - - [3584, 128, 1, 128] - - [415, 2463.1] + - [419, 2463.1] - - [1024, 6784, 1, 128] - - [480, 3862.02] + - [484, 3862.02] - - [4, 1856, 1, 128] - - [500, 30.5362] + - [504, 30.5362] - - [1408, 64, 1, 3328] - - [469, 6095.38] + - [473, 6095.38] - - [6784, 4, 1, 256] - - [502, 487.838] + - [506, 487.838] - - [1408, 1408, 1, 1280] - - [498, 8640.53] + - [502, 8640.53] - - [256, 2368, 1, 256] - - [490, 4282.26] + - [494, 4282.26] - - [448, 4288, 1, 3328] - - [487, 8516.03] + - [491, 8516.03] - - [2368, 1408, 1, 256] - - [493, 8632.09] + - [497, 8632.09] - - [5888, 5056, 1, 128] - - [479, 5091.01] + - [483, 5091.01] - - [704, 2368, 1, 256] - - [493, 7664.7] + - [497, 7664.7] - - [2944, 448, 1, 1280] - - [493, 7618.25] + - [497, 7618.25] - - [5888, 2368, 1, 3328] - - [496, 9343.38] + - [500, 9343.38] - - [64, 2944, 1, 1280] - - [461, 5162.08] + - [465, 5162.08] - - [448, 1856, 1, 1280] - - [487, 7027.9] + - [491, 7027.9] - - [4288, 448, 1, 1280] - - [487, 5855.66] + - [491, 5855.66] - - [5888, 704, 1, 3328] - - [496, 9190.81] + - [500, 9190.81] - - [5056, 256, 1, 128] - - [482, 3235.84] + - [486, 3235.84] - - [1856, 256, 1, 128] - - [480, 1849.68] + - [484, 1849.68] - - [5056, 128, 1, 256] - - [493, 6108.96] + - [497, 6108.96] - - [704, 4, 1, 256] - - [453, 125.156] + - [457, 125.156] - - [1408, 5888, 1, 128] - - [479, 5055.06] + - [483, 5055.06] - - [4288, 4, 1, 128] - - [499, 95.6209] + - [503, 95.6209] - - [1408, 1024, 1, 256] - - [487, 7370.18] + - [491, 7370.18] - - [1024, 1856, 1, 128] - - [479, 2966.7] + - [483, 2966.7] - - [256, 704, 1, 128] - - [481, 528.129] + - [485, 528.129] - - [256, 1024, 1, 128] - - [479, 1171.59] + - [483, 1171.59] - - [448, 1024, 1, 256] - - [493, 5624.55] + - [497, 5624.55] - - [128, 4, 1, 3328] - - [510, 191.885] + - [514, 191.885] - - [5056, 6784, 1, 1280] - - [487, 9543.97] + - [491, 9543.97] - - [704, 5056, 1, 3328] - - [494, 8790.25] + - [498, 8790.25] - - [64, 1408, 1, 1280] - - [461, 4505.6] + - [465, 4505.6] - - [3584, 5056, 1, 3328] - - [492, 9073.42] + - [496, 9073.42] - - [1856, 4, 1, 3328] - - [510, 612.775] + - [514, 612.775] - - [4, 2944, 1, 128] - - [499, 71.9145] + - [503, 71.9145] - - [2368, 2944, 1, 3328] - - [485, 9314.58] + - [489, 9314.58] - - [448, 448, 1, 1280] - - [469, 5129.81] + - [473, 5129.81] - - [2368, 3584, 1, 256] - - [487, 8998.7] + - [491, 8998.7] - - [5056, 3584, 1, 1280] - - [488, 9345.07] + - [492, 9345.07] - - [448, 4, 1, 3328] - - [510, 487.237] + - [514, 487.237] - - [1856, 2944, 1, 1280] - - [498, 8438.69] + - [502, 8438.69] - - [3584, 2368, 1, 1280] - - [493, 9298.8] + - [497, 9298.8] - - [128, 1024, 1, 256] - - [445, 2356.35] + - [449, 2356.35] - - [2944, 1408, 1, 256] - - [485, 5440.72] + - [489, 5440.72] - - [4288, 1408, 1, 3328] - - [485, 9385.99] + - [489, 9385.99] - - [3584, 64, 1, 3328] - - [441, 6310.87] + - [445, 6310.87] - - [1408, 128, 1, 256] - - [487, 2942.43] + - [491, 2942.43] - - [2944, 1024, 1, 128] - - [482, 3927.89] + - [486, 3927.89] - - [4288, 5056, 1, 1280] - - [489, 8328.48] + - [493, 8328.48] - - [5888, 6784, 1, 1280] - - [498, 9757.34] + - [502, 9757.34] - - [6784, 5056, 1, 128] - - [478, 5101.3] + - [482, 5101.3] - - [256, 1024, 1, 3328] - - [487, 6475.77] + - [491, 6475.77] - - [3584, 4, 1, 256] - - [503, 420.873] + - [507, 420.873] - - [1856, 64, 1, 3328] - - [469, 6409.1] + - [473, 6409.1] - - [64, 6784, 1, 128] - - [417, 2387.22] + - [421, 2387.22] - - [5888, 1408, 1, 3328] - - [492, 9655.79] + - [496, 9655.79] - - [5888, 64, 1, 1280] - - [487, 5870.76] + - [491, 5870.76] - - [256, 5056, 1, 256] - - [490, 6108.96] + - [494, 6108.96] - - [128, 3584, 1, 128] - - [420, 2383.13] + - [424, 2383.13] - - [448, 3584, 1, 3328] - - [485, 7092.18] + - [489, 7092.18] - - [704, 2368, 1, 128] - - [479, 3740.98] + - [483, 3740.98] - - [5888, 256, 1, 128] - - [480, 2977.44] + - [484, 2977.44] - - [4, 5056, 1, 128] - - [499, 132.62] + - [503, 132.62] - - [448, 256, 1, 256] - - [451, 2308.19] + - [455, 2308.19] - - [704, 4, 1, 3328] - - [456, 552.574] + - [460, 552.574] - - [1408, 256, 1, 256] - - [487, 4577.12] + - [491, 4577.12] - - [3584, 1856, 1, 128] - - [479, 4571.76] + - [483, 4571.76] - - [4288, 4288, 1, 128] - - [482, 5284.55] + - [486, 5284.55] - - [1856, 1024, 1, 3328] - - [493, 6362.15] + - [497, 6362.15] - - [128, 5888, 1, 3328] - - [463, 7040.73] + - [467, 7040.73] - - [1024, 5056, 1, 256] - - [498, 7855.6] + - [502, 7855.6] - - [2368, 1408, 1, 3328] - - [493, 9205.56] + - [497, 9205.56] - - [5888, 448, 1, 256] - - [490, 5538.74] + - [494, 5538.74] - - [5888, 6784, 1, 128] - - [478, 4500.75] + - [482, 4500.75] - - [2368, 4, 1, 3328] - - [456, 642.798] + - [460, 642.798] - - [6784, 5056, 1, 1280] - - [494, 9249.13] + - [498, 9249.13] - - [5056, 704, 1, 1280] - - [493, 8883.27] + - [497, 8883.27] - - [1408, 256, 1, 1280] - - [487, 5632.0] + - [491, 5632.0] - - [4288, 6784, 1, 1280] - - [493, 8843.21] + - [497, 8843.21] - - [128, 704, 1, 256] - - [451, 2045.09] + - [455, 2045.09] - - [448, 128, 1, 1280] - - [461, 3807.07] + - [465, 3807.07] - - [6784, 4, 1, 3328] - - [504, 684.571] + - [508, 684.571] - - [4288, 4, 1, 1280] - - [453, 601.825] + - [457, 601.825] - - [1024, 64, 1, 3328] - - [465, 3928.38] + - [469, 3928.38] - - [1856, 4, 1, 256] - - [503, 293.294] + - [507, 293.294] - - [64, 3584, 1, 1280] - - [487, 5265.45] + - [491, 5265.45] - - [6784, 1408, 1, 256] - - [487, 9059.26] + - [491, 9059.26] - - [3584, 5888, 1, 128] - - [479, 5084.19] + - [483, 5084.19] - - [5056, 5888, 1, 256] - - [498, 8589.99] + - [502, 8589.99] - - [2368, 1024, 1, 256] - - [490, 4493.03] + - [494, 4493.03] - - [2944, 1856, 1, 256] - - [496, 5202.31] + - [500, 5202.31] - - [1856, 6784, 1, 1280] - - [494, 9071.38] + - [498, 9071.38] - - [64, 5056, 1, 128] - - [417, 2038.32] + - [421, 2038.32] - - [5888, 64, 1, 128] - - [416, 2016.49] + - [420, 2016.49] - - [448, 704, 1, 128] - - [480, 1173.55] + - [484, 1173.55] - - [4, 1024, 1, 128] - - [499, 8.79685] + - [503, 8.79685] - - [4288, 3584, 1, 256] - - [493, 9080.16] + - [497, 9080.16] - - [1408, 704, 1, 128] - - [479, 3165.61] + - [483, 3165.61] - - [64, 256, 1, 3328] - - [473, 3126.49] + - [477, 3126.49] - - [5056, 1856, 1, 1280] - - [490, 8857.45] + - [494, 8857.45] - - [1408, 1024, 1, 3328] - - [496, 8177.02] + - [500, 8177.02] - - [2368, 256, 1, 3328] - - [487, 6810.21] + - [491, 6810.21] - - [5888, 3584, 1, 1280] - - [485, 9535.45] + - [489, 9535.45] - - [1856, 3584, 1, 3328] - - [487, 9281.81] + - [491, 9281.81] - - [5888, 128, 1, 1280] - - [493, 8136.72] + - [497, 8136.72] - - [1024, 2944, 1, 256] - - [485, 7247.86] + - [489, 7247.86] - - [448, 6784, 1, 1280] - - [493, 7013.94] + - [497, 7013.94] - - [256, 3584, 1, 1280] - - [487, 7738.54] + - [491, 7738.54] - - [448, 128, 1, 128] - - [417, 495.948] + - [421, 495.948] - - [704, 5056, 1, 256] - - [493, 8609.34] + - [497, 8609.34] - - [3584, 1024, 1, 3328] - - [486, 7765.63] + - [490, 7765.63] - - [2944, 1856, 1, 1280] - - [498, 7775.93] + - [502, 7775.93] - - [128, 256, 1, 128] - - [430, 296.208] + - [434, 296.208] - - [5056, 256, 1, 256] - - [487, 7829.63] + - [491, 7829.63] - - [2368, 3584, 1, 3328] - - [486, 8895.98] + - [490, 8895.98] - - [2944, 704, 1, 1280] - - [496, 6855.73] + - [500, 6855.73] - - [128, 4, 1, 256] - - [505, 24.8242] + - [509, 24.8242] - - [2944, 3584, 1, 1280] - - [498, 9049.12] + - [502, 9049.12] - - [1856, 5888, 1, 1280] - - [493, 9431.96] + - [497, 9431.96] - - [256, 256, 1, 1280] - - [458, 3942.02] + - [462, 3942.02] - - [5056, 448, 1, 3328] - - [498, 4587.73] + - [502, 4587.73] - - [4288, 1408, 1, 256] - - [498, 5408.73] + - [502, 5408.73] - - [3584, 64, 1, 256] - - [467, 2496.61] + - [471, 2496.61] - - [64, 1856, 1, 3328] - - [440, 5896.68] + - [444, 5896.68] - - [256, 1408, 1, 128] - - [479, 1643.07] + - [483, 1643.07] - - [5888, 1408, 1, 128] - - [478, 4436.27] + - [482, 4436.27] - - [4288, 2368, 1, 1280] - - [487, 9432.94] + - [491, 9432.94] - - [4, 4288, 1, 256] - - [502, 442.632] + - [506, 442.632] - - [256, 4288, 1, 128] - - [479, 2814.69] + - [483, 2814.69] - - [256, 128, 1, 3328] - - [468, 3951.16] + - [472, 3951.16] - - [6784, 2368, 1, 256] - - [487, 9169.89] + - [491, 9169.89] - - [5888, 128, 1, 128] - - [416, 3156.71] + - [420, 3156.71] - - [4288, 1856, 1, 256] - - [493, 5658.13] + - [497, 5658.13] - - [1856, 256, 1, 3328] - - [487, 7646.27] + - [491, 7646.27] - - [1856, 2944, 1, 256] - - [494, 6444.88] + - [498, 6444.88] - - [5056, 1024, 1, 128] - - [478, 4607.2] + - [482, 4607.2] - - [64, 5888, 1, 1280] - - [493, 5842.36] + - [497, 5842.36] - - [1760, 7133, 1, 1760] - - [486, 9097.74] + - [490, 9097.74] - - [6784, 256, 1, 128] - - [479, 3685.31] + - [483, 3685.31] - - [5888, 704, 1, 128] - - [478, 3656.13] + - [482, 3656.13] - - [6784, 64, 1, 128] - - [429, 2191.42] + - [433, 2191.42] - - [1024, 4288, 1, 1280] - - [493, 9199.22] + - [497, 9199.22] - - [2368, 5056, 1, 3328] - - [489, 9072.78] + - [493, 9072.78] - - [448, 4, 1, 128] - - [500, 5.32937] + - [504, 5.32937] - - [4, 256, 1, 3328] - - [510, 310.937] + - [514, 310.937] - - [4288, 1024, 1, 3328] - - [491, 8660.23] + - [495, 8660.23] - - [1024, 5056, 1, 3328] - - [487, 8886.66] + - [491, 8886.66] - - [1024, 1856, 1, 3328] - - [492, 8426.14] + - [496, 8426.14] - - [704, 704, 1, 1280] - - [487, 7661.7] + - [491, 7661.7] - - [128, 2368, 1, 1280] - - [461, 5746.05] + - [465, 5746.05] - - [1408, 128, 1, 3328] - - [469, 6530.77] + - [473, 6530.77] - - [3584, 256, 1, 1280] - - [493, 7633.94] + - [497, 7633.94] - - [4, 128, 1, 128] - - [500, 1.97874] + - [504, 1.97874] - - [704, 6784, 1, 128] - - [482, 4589.49] + - [486, 4589.49] - - [3584, 128, 1, 1280] - - [487, 7078.14] + - [491, 7078.14] - - [4, 256, 1, 1280] - - [456, 178.087] + - [460, 178.087] - - [128, 704, 1, 3328] - - [461, 5959.71] + - [465, 5959.71] - - [4288, 6784, 1, 256] - - [487, 9326.44] + - [491, 9326.44] - - [3584, 2944, 1, 3328] - - [489, 9114.06] + - [493, 9114.06] - - [128, 1856, 1, 256] - - [493, 3672.55] + - [497, 3672.55] - - [64, 4288, 1, 256] - - [487, 3457.41] + - [491, 3457.41] - - [4, 3584, 1, 3328] - - [436, 694.27] + - [440, 694.27] - - [64, 4, 1, 3328] - - [456, 71.4738] + - [460, 71.4738] - - [4, 64, 1, 3328] - - [456, 91.8069] + - [460, 91.8069] - - [5888, 2944, 1, 256] - - [486, 7241.45] + - [490, 7241.45] - - [2368, 6784, 1, 128] - - [482, 5229.53] + - [486, 5229.53] - - [448, 4288, 1, 1280] - - [487, 8416.3] + - [491, 8416.3] - - [448, 1856, 1, 3328] - - [487, 7161.46] + - [491, 7161.46] - - [4, 1024, 1, 256] - - [453, 187.246] + - [457, 187.246] - - [5056, 4288, 1, 256] - - [498, 8947.16] + - [502, 8947.16] - - [1024, 448, 1, 256] - - [493, 5318.86] + - [497, 5318.86] - - [1024, 3584, 1, 256] - - [488, 6151.94] + - [492, 6151.94] - - [2944, 128, 1, 1280] - - [469, 6053.53] + - [473, 6053.53] - - [1856, 5056, 1, 128] - - [479, 5091.32] + - [483, 5091.32] - - [64, 256, 1, 256] - - [442, 771.012] + - [446, 771.012] - - [1408, 4, 1, 128] - - [499, 40.7758] + - [503, 40.7758] - - [128, 2368, 1, 128] - - [427, 1520.27] + - [431, 1520.27] - - [256, 704, 1, 1280] - - [487, 4329.71] + - [491, 4329.71] - - [64, 2368, 1, 128] - - [418, 1212.42] + - [422, 1212.42] - - [6784, 6784, 1, 3328] - - [498, 8310.57] + - [502, 8310.57] - - [448, 5888, 1, 1280] - - [493, 8502.23] + - [497, 8502.23] - - [5056, 448, 1, 128] - - [479, 4160.9] + - [483, 4160.9] - - [3584, 2944, 1, 128] - - [479, 4363.41] + - [483, 4363.41] - - [6784, 256, 1, 1280] - - [493, 8629.57] + - [497, 8629.57] - - [256, 2944, 1, 1280] - - [493, 7277.38] + - [497, 7277.38] - - [64, 4288, 1, 128] - - [418, 1821.96] + - [422, 1821.96] - - [2368, 5888, 1, 3328] - - [487, 9017.42] + - [491, 9017.42] - - [4, 64, 1, 256] - - [453, 16.0627] + - [457, 16.0627] - - [704, 1024, 1, 3328] - - [493, 8059.45] + - [497, 8059.45] - - [2368, 1856, 1, 1280] - - [493, 8813.14] + - [497, 8813.14] - - [128, 448, 1, 128] - - [414, 588.144] + - [418, 588.144] - - [128, 6784, 1, 256] - - [493, 6538.18] + - [497, 6538.18] - - [3584, 4288, 1, 128] - - [479, 5025.36] + - [483, 5025.36] - - [64, 448, 1, 128] - - [431, 231.693] + - [435, 231.693] - - [5888, 4288, 1, 3328] - - [487, 9515.78] + - [491, 9515.78] - - [2368, 704, 1, 256] - - [493, 7642.74] + - [497, 7642.74] - - [256, 1856, 1, 3328] - - [493, 6547.07] + - [497, 6547.07] - - [1856, 128, 1, 256] - - [487, 3782.18] + - [491, 3782.18] - - [6784, 128, 1, 128] - - [421, 2835.44] + - [425, 2835.44] - - [3584, 1408, 1, 128] - - [478, 3049.11] + - [482, 3049.11] - - [1856, 5056, 1, 1280] - - [494, 8863.2] + - [498, 8863.2] - - [2944, 1024, 1, 1280] - - [498, 8873.15] + - [502, 8873.15] - - [5056, 4, 1, 256] - - [434, 494.021] + - [438, 494.021] - - [3584, 5888, 1, 3328] - - [486, 9585.15] + - [490, 9585.15] - - [2368, 4288, 1, 256] - - [498, 6418.95] + - [502, 6418.95] - - [1024, 2368, 1, 3328] - - [493, 8645.26] + - [497, 8645.26] - - [64, 704, 1, 3328] - - [475, 4399.83] + - [479, 4399.83] - - [704, 1408, 1, 256] - - [487, 7428.44] + - [491, 7428.44] - - [6784, 1856, 1, 3328] - - [498, 9163.56] + - [502, 9163.56] - - [1024, 2944, 1, 128] - - [482, 3551.88] + - [486, 3551.88] - - [1024, 3584, 1, 1280] - - [496, 9112.37] + - [500, 9112.37] - - [4288, 5888, 1, 3328] - - [486, 8523.95] + - [490, 8523.95] - - [4288, 4, 1, 3328] - - [453, 619.916] + - [457, 619.916] - - [256, 1408, 1, 256] - - [487, 4505.6] + - [491, 4505.6] - - [448, 2944, 1, 1280] - - [487, 7612.77] + - [491, 7612.77] - - [4, 5888, 1, 128] - - [499, 174.464] + - [503, 174.464] - - [1024, 2944, 1, 3328] - - [492, 9136.64] + - [496, 9136.64] - - [3584, 6784, 1, 256] - - [492, 7253.79] + - [496, 7253.79] - - [256, 6784, 1, 1280] - - [487, 8637.62] + - [491, 8637.62] - - [1856, 3584, 1, 256] - - [493, 8199.57] + - [497, 8199.57] - - [128, 448, 1, 3328] - - [474, 4799.82] + - [478, 4799.82] - - [6784, 1856, 1, 128] - - [479, 5185.52] + - [483, 5185.52] - - [4, 448, 1, 256] - - [453, 86.8848] + - [457, 86.8848] - - [2944, 704, 1, 128] - - [482, 3798.54] + - [486, 3798.54] - - [256, 5888, 1, 1280] - - [487, 8678.37] + - [491, 8678.37] - - [4, 128, 1, 1280] - - [456, 102.4] + - [460, 102.4] - - [4288, 6784, 1, 3328] - - [492, 8209.3] + - [496, 8209.3] - - [6784, 128, 1, 1280] - - [469, 6562.89] + - [473, 6562.89] - - [64, 1408, 1, 256] - - [459, 2059.7] + - [463, 2059.7] - - [7680, 5481, 1, 2560] - - [498, 9426.69] + - [502, 9426.69] - - [2368, 1408, 1, 128] - - [479, 4532.4] + - [483, 4532.4] - - [1856, 448, 1, 256] - - [487, 6275.38] + - [491, 6275.38] - - [1408, 1024, 1, 128] - - [479, 3604.48] + - [483, 3604.48] - - [128, 64, 1, 128] - - [414, 87.3813] + - [418, 87.3813] - - [6784, 3584, 1, 3328] - - [494, 8991.82] + - [498, 8991.82] - - [2944, 64, 1, 3328] - - [463, 6043.26] + - [467, 6043.26] - - [64, 64, 1, 128] - - [419, 36.209] + - [423, 36.209] - - [2368, 5056, 1, 1280] - - [493, 9438.38] + - [497, 9438.38] - - [64, 4, 1, 1280] - - [456, 40.1569] + - [460, 40.1569] - - [1408, 2368, 1, 1280] - - [489, 7738.06] + - [493, 7738.06] - - [128, 1408, 1, 1280] - - [461, 4937.64] + - [465, 4937.64] - - [256, 64, 1, 3328] - - [471, 2683.36] + - [475, 2683.36] - - [2944, 4288, 1, 128] - - [479, 5173.71] + - [483, 5173.71] - - [2944, 2944, 1, 256] - - [487, 8943.82] + - [491, 8943.82] - - [2944, 4, 1, 1280] - - [436, 617.757] + - [440, 617.757] - - [5888, 4, 1, 256] - - [502, 483.118] + - [506, 483.118] - - [6784, 256, 1, 256] - - [493, 7916.6] + - [497, 7916.6] - - [256, 5056, 1, 3328] - - [487, 8953.15] + - [491, 8953.15] - - [128, 4288, 1, 1280] - - [440, 6014.95] + - [444, 6014.95] - - [5056, 1856, 1, 128] - - [481, 4221.05] + - [485, 4221.05] - - [5888, 1408, 1, 256] - - [492, 9144.75] + - [496, 9144.75] - - [128, 128, 1, 256] - - [442, 759.838] + - [446, 759.838] - - [5056, 4, 1, 3328] - - [502, 642.718] + - [506, 642.718] - - [4288, 3584, 1, 3328] - - [488, 9299.95] + - [492, 9299.95] - - [448, 704, 1, 3328] - - [494, 4480.98] + - [498, 4480.98] - - [448, 448, 1, 128] - - [418, 1360.71] + - [422, 1360.71] - - [1024, 2368, 1, 1280] - - [487, 8570.19] + - [491, 8570.19] - - [1856, 704, 1, 3328] - - [487, 8448.16] + - [491, 8448.16] - - [4, 2368, 1, 128] - - [499, 64.4902] + - [503, 64.4902] - - [5888, 6784, 1, 3328] - - [494, 9447.02] + - [498, 9447.02] - - [704, 4288, 1, 1280] - - [496, 7476.77] + - [500, 7476.77] - - [704, 256, 1, 256] - - [487, 2957.52] + - [491, 2957.52] - - [6784, 448, 1, 3328] - - [490, 8886.12] + - [494, 8886.12] - - [4288, 1024, 1, 128] - - [478, 3864.39] + - [482, 3864.39] - - [49, 512, 128, 2048] - - [521, 7112.68] + - [525, 7112.68] - - [196, 256, 256, 1024] - - [515, 8302.6] + - [519, 8302.6] - - [784, 512, 256, 128] - - [513, 9061.26] + - [517, 9061.26] - - [49, 2048, 128, 512] - - [511, 6963.26] + - [515, 6963.26] - - [784, 128, 128, 512] - - [520, 8983.53] + - [524, 8983.53] - - [3136, 256, 256, 64] - - [516, 9051.28] + - [520, 9051.28] - - [3136, 64, 128, 64] - - [512, 8581.25] + - [516, 8581.25] - - [49, 2048, 256, 512] - - [511, 7049.54] + - [515, 7049.54] - - [784, 128, 256, 512] - - [522, 9102.89] + - [526, 9102.89] - - [196, 256, 128, 1024] - - [514, 8085.79] + - [518, 8085.79] - - [3136, 64, 128, 256] - - [518, 9381.29] + - [522, 9381.29] - - [3136, 256, 128, 64] - - [516, 8982.54] + - [520, 8982.54] - - [784, 512, 128, 128] - - [513, 8965.89] + - [517, 8965.89] - - [3136, 64, 256, 256] - - [518, 9566.33] + - [522, 9566.33] - - [3136, 64, 256, 64] - - [512, 8743.7] + - [516, 8743.7] - - [196, 1024, 128, 256] - - [515, 8119.33] + - [519, 8119.33] - - [49, 512, 256, 2048] - - [524, 7166.31] + - [528, 7166.31] - - [196, 1024, 256, 256] - - [515, 8210.56] + - [519, 8210.56] - - [5329, 160, 64, 64] - - [531, 8156.79] + - [535, 8156.79] - - [1225, 384, 64, 192] - - [528, 9162.25] + - [532, 9162.25] - - [289, 1024, 64, 256] - - [528, 8483.73] + - [532, 8483.73] - - [64, 1536, 64, 384] - - [538, 9323.55] + - [542, 9323.55] - - [1225, 384, 64, 64] - - [537, 8158.7] + - [541, 8158.7] - - [1225, 384, 64, 96] - - [528, 8540.6] + - [532, 8540.6] - - [64, 1536, 64, 256] - - [534, 9142.9] + - [538, 9142.9] - - [289, 1024, 64, 384] - - [526, 8725.56] + - [530, 8725.56] - - [289, 1024, 64, 192] - - [528, 8313.06] + - [532, 8313.06] - - [289, 1024, 64, 128] - - [534, 7989.41] + - [538, 7989.41] - - [4096, 1024, 1, 2984] - - [573, 9846.29] + - [577, 9846.29] - - [1024, 4096, 1, 3437] - - [574, 9915.7] + - [578, 9915.7] - - [1024, 4096, 1, 3235] - - [567, 9913.92] + - [571, 9913.92] - - [4096, 1024, 1, 4032] - - [573, 9925.96] + - [577, 9925.96] - - [1024, 4096, 1, 3334] - - [574, 9918.17] + - [578, 9918.17] - - [4096, 1024, 1, 3288] - - [574, 9854.57] + - [578, 9854.57] - - [1024, 4096, 1, 3515] - - [574, 9923.93] + - [578, 9923.93] - - [4096, 1024, 1, 3437] - - [574, 9869.53] + - [578, 9869.53] - - [1024, 4096, 1, 3259] - - [574, 9907.55] + - [578, 9907.55] - - [1024, 4096, 1, 3384] - - [566, 9921.11] + - [570, 9921.11] - - [64, 92, 688, 92] - - [544, 6137.79] + - [548, 6137.79] - - [4096, 1024, 1, 3458] - - [573, 9887.59] + - [577, 9887.59] - - [1024, 4096, 1, 3412] - - [573, 9930.46] + - [577, 9930.46] - - [1024, 4096, 1, 3529] - - [567, 9924.44] + - [571, 9924.44] - - [1024, 4096, 1, 4032] - - [574, 9963.38] + - [578, 9963.38] - - [4096, 1024, 1, 3999] - - [574, 9894.9] + - [578, 9894.9] - - [1024, 4096, 1, 3079] - - [567, 9894.48] + - [571, 9894.48] - - [1024, 4096, 1, 3876] - - [566, 9949.29] + - [570, 9949.29] - - [1024, 4096, 1, 3450] - - [574, 9915.55] + - [578, 9915.55] - - [1024, 4096, 1, 3256] - - [574, 9911.08] + - [578, 9911.08] - - [4096, 1024, 1, 3403] - - [573, 9858.83] + - [577, 9858.83] - - [1024, 1024, 1, 3975] - - [564, 8990.71] + - [568, 8990.71] - - [1024, 4096, 1, 3359] - - [574, 9914.9] + - [578, 9914.9] - - [4096, 1024, 1, 3549] - - [573, 9870.56] + - [577, 9870.56] - - [4096, 1024, 1, 3176] - - [574, 9855.82] + - [578, 9855.82] - - [1024, 4096, 1, 3504] - - [566, 9934.07] + - [570, 9934.07] - - [4096, 1024, 1, 3314] - - [573, 9873.8] + - [577, 9873.8] - - [4096, 1024, 1, 3183] - - [573, 9843.74] + - [577, 9843.74] - - [1024, 4096, 1, 3209] - - [567, 9904.87] + - [571, 9904.87] - - [1024, 4096, 1, 3720] - - [566, 9934.06] + - [570, 9934.06] - - [1024, 4096, 1, 3859] - - [566, 9952.43] + - [570, 9952.43] - - [1024, 33708, 1, 4059] - - [566, 10321.4] + - [570, 10321.4] - - [1024, 4096, 1, 3968] - - [566, 9955.86] + - [570, 9955.86] - - [64, 123, 528, 123] - - [539, 6916.11] + - [543, 6916.11] - - [4096, 1024, 1, 3477] - - [574, 9871.93] + - [578, 9871.93] - - [4096, 1024, 1, 3233] - - [574, 9862.25] + - [578, 9862.25] - - [4096, 1024, 1, 3409] - - [574, 9876.76] + - [578, 9876.76] - - [4096, 1024, 1, 3564] - - [574, 9870.39] + - [578, 9870.39] - - [64, 102, 624, 100] - - [539, 5773.06] + - [543, 5773.06] - - [4096, 1024, 1, 3190] - - [573, 9850.87] + - [577, 9850.87] - - [64, 112, 576, 111] - - [539, 6517.25] + - [543, 6517.25] - - [1024, 4096, 1, 3288] - - [573, 9911.8] + - [577, 9911.8] - - [4096, 1024, 1, 3451] - - [573, 9859.51] + - [577, 9859.51] - - [1024, 4096, 1, 3348] - - [566, 9915.37] + - [570, 9915.37] - - [64, 102, 624, 102] - - [539, 5783.6] + - [543, 5783.6] - - [1024, 4096, 1, 3465] - - [567, 9913.02] + - [571, 9913.02] - - [1024, 33708, 1, 4032] - - [566, 10340.3] + - [570, 10340.3] - - [1024, 33708, 1, 3840] - - [566, 10341.7] + - [570, 10341.7] - - [4096, 1024, 1, 3391] - - [574, 9861.67] + - [578, 9861.67] - - [1024, 4096, 1, 3530] - - [566, 9920.34] + - [570, 9920.34] - - [4096, 1024, 1, 3209] - - [573, 9846.9] + - [577, 9846.9] - - [1024, 4096, 1, 3457] - - [567, 9917.19] + - [571, 9917.19] - - [1024, 4096, 1, 3386] - - [566, 9917.55] + - [570, 9917.55] - - [4096, 1024, 1, 3350] - - [573, 9884.44] + - [577, 9884.44] - - [1024, 4096, 1, 3184] - - [574, 9925.88] + - [578, 9925.88] - - [1024, 4096, 1, 3093] - - [573, 9902.45] + - [577, 9902.45] - - [64, 133, 480, 135] - - [556, 6205.87] + - [560, 6205.87] - - [1024, 4096, 1, 3400] - - [566, 9917.0] + - [570, 9917.0] - - [1024, 1024, 1, 4026] - - [572, 9014.29] + - [576, 9014.29] - - [1024, 4096, 1, 3214] - - [566, 9895.84] + - [570, 9895.84] - - [4096, 1024, 1, 3406] - - [574, 9857.72] + - [578, 9857.72] - - [1024, 4096, 1, 3565] - - [573, 9919.27] + - [577, 9919.27] - - [4096, 1024, 1, 3536] - - [574, 9888.96] + - [578, 9888.96] - - [1024, 4096, 1, 3183] - - [573, 9907.45] + - [577, 9907.45] - - [1024, 4096, 1, 3462] - - [574, 9922.3] + - [578, 9922.3] - - [4096, 1024, 1, 3130] - - [567, 9845.94] + - [571, 9845.94] - - [4096, 1024, 1, 3381] - - [574, 9868.17] + - [578, 9868.17] - - [4096, 1024, 1, 3298] - - [573, 9870.44] + - [577, 9870.44] - - [1024, 4096, 1, 3292] - - [566, 9906.2] + - [570, 9906.2] - - [4096, 1024, 1, 3289] - - [573, 9856.45] + - [577, 9856.45] - - [64, 160, 400, 159] - - [559, 7427.74] + - [563, 7427.74] - - [1024, 4096, 1, 3379] - - [566, 9916.99] + - [570, 9916.99] - - [1024, 4096, 1, 3990] - - [567, 9947.27] + - [571, 9947.27] - - [1024, 4096, 1, 3540] - - [574, 9935.66] + - [578, 9935.66] - - [4096, 1024, 1, 3412] - - [574, 9867.46] + - [578, 9867.46] - - [1024, 1024, 1, 3780] - - [569, 9036.16] + - [573, 9036.16] - - [1024, 4096, 1, 3555] - - [573, 9927.27] + - [577, 9927.27] - - [1024, 4096, 1, 3518] - - [567, 9925.45] + - [571, 9925.45] - - [4096, 1024, 1, 3189] - - [573, 9861.14] + - [577, 9861.14] - - [1024, 4096, 1, 3298] - - [567, 9923.12] + - [571, 9923.12] - - [4096, 1024, 1, 3072] - - [573, 9871.98] + - [577, 9871.98] - - [1024, 4096, 1, 3393] - - [574, 9929.18] + - [578, 9929.18] - - [1024, 4096, 1, 3207] - - [566, 9912.71] + - [570, 9912.71] - - [64, 228, 272, 232] - - [562, 7350.04] + - [566, 7350.04] - - [64, 23, 2720, 23] - - [543, 2640.15] + - [547, 2640.15] - - [4096, 1024, 1, 3487] - - [574, 9860.81] + - [578, 9860.81] - - [1024, 1024, 1, 3822] - - [572, 8993.86] + - [576, 8993.86] - - [64, 77, 816, 77] - - [544, 5273.09] + - [548, 5273.09] - - [4096, 1024, 1, 3431] - - [574, 9867.43] + - [578, 9867.43] - - [4096, 1024, 1, 3378] - - [573, 9888.04] + - [577, 9888.04] - - [4096, 1024, 1, 3529] - - [567, 9879.4] + - [571, 9879.4] - - [4096, 1024, 1, 3460] - - [574, 9877.15] + - [578, 9877.15] - - [1024, 4096, 1, 3336] - - [566, 9912.31] + - [570, 9912.31] - - [1024, 4096, 1, 3501] - - [567, 9914.3] + - [571, 9914.3] - - [64, 159, 400, 159] - - [557, 7016.41] + - [561, 7016.41] - - [1024, 4096, 1, 3584] - - [574, 9940.49] + - [578, 9940.49] - - [64, 135, 480, 134] - - [557, 6241.29] + - [561, 6241.29] - - [64, 99, 624, 99] - - [548, 5617.29] + - [552, 5617.29] - - [4096, 1024, 1, 2499] - - [573, 9813.47] + - [577, 9813.47] - - [1024, 1024, 1, 3942] - - [569, 9059.91] + - [573, 9059.91] - - [4096, 1024, 1, 3352] - - [573, 9867.02] + - [577, 9867.02] - - [1024, 4096, 1, 3543] - - [574, 9928.67] + - [578, 9928.67] - - [1024, 4096, 1, 3476] - - [573, 9931.48] + - [577, 9931.48] - - [1024, 33708, 1, 3822] - - [566, 10324.6] + - [570, 10324.6] - - [1024, 4096, 1, 3436] - - [566, 9917.18] + - [570, 9917.18] - - [1024, 1024, 1, 3861] - - [565, 8998.39] + - [569, 8998.39] - - [1024, 1024, 1, 4000] - - [570, 9058.2] + - [574, 9058.2] - - [1024, 4096, 1, 3594] - - [566, 9927.78] + - [570, 9927.78] - - [4096, 1024, 1, 3514] - - [574, 9872.2] + - [578, 9872.2] - - [1024, 4096, 1, 3064] - - [573, 9907.0] + - [577, 9907.0] - - [4096, 1024, 1, 3371] - - [566, 9857.64] + - [570, 9857.64] - - [4096, 1024, 1, 3558] - - [574, 9876.21] + - [578, 9876.21] - - [4096, 1024, 1, 3517] - - [573, 9866.35] + - [577, 9866.35] - - [4096, 1024, 1, 3144] - - [573, 9846.26] + - [577, 9846.26] - - [1024, 4096, 1, 3312] - - [566, 9932.75] + - [570, 9932.75] - - [4096, 1024, 1, 3079] - - [573, 9851.0] + - [577, 9851.0] - - [1024, 4096, 1, 3415] - - [566, 9919.37] + - [570, 9919.37] - - [1024, 4096, 1, 3221] - - [573, 9908.08] + - [577, 9908.08] - - [1024, 4096, 1, 3978] - - [567, 9944.31] + - [571, 9944.31] - - [4096, 1024, 1, 3876] - - [573, 9898.89] + - [577, 9898.89] - - [1024, 4096, 1, 3528] - - [566, 9919.5] + - [570, 9919.5] - - [1024, 4096, 1, 3181] - - [574, 9894.76] + - [578, 9894.76] - - [4096, 1024, 1, 3445] - - [573, 9878.44] + - [577, 9878.44] - - [4096, 1024, 1, 3450] - - [566, 9864.72] + - [570, 9864.72] - - [4096, 1024, 1, 3377] - - [573, 9879.59] + - [577, 9879.59] - - [1024, 4096, 1, 3532] - - [567, 9928.09] + - [571, 9928.09] - - [1024, 33708, 1, 3944] - - [566, 10329.6] + - [570, 10329.6] - - [4096, 1024, 1, 3483] - - [573, 9861.73] + - [577, 9861.73] - - [1024, 4096, 1, 3358] - - [566, 9903.59] + - [570, 9903.59] - - [4096, 1024, 1, 3464] - - [573, 9876.74] + - [577, 9876.74] - - [4096, 1024, 1, 3282] - - [566, 9859.13] + - [570, 9859.13] - - [4096, 1024, 1, 3256] - - [574, 9855.0] + - [578, 9855.0] - - [1024, 4096, 1, 3057] - - [573, 9910.65] + - [577, 9910.65] - - [4096, 1024, 1, 3481] - - [573, 9866.19] + - [577, 9866.19] - - [4096, 1024, 1, 3340] - - [573, 9862.15] + - [577, 9862.15] - - [1024, 1024, 1, 3870] - - [572, 9082.35] + - [576, 9082.35] - - [1024, 4096, 1, 3273] - - [566, 9916.19] + - [570, 9916.19] - - [64, 65, 992, 65] - - [557, 4682.91] + - [561, 4682.91] - - [4096, 1024, 1, 3392] - - [567, 9881.02] + - [571, 9881.02] - - [4096, 1024, 1, 3337] - - [573, 9864.4] + - [577, 9864.4] - - [4096, 1024, 1, 3359] - - [573, 9874.32] + - [577, 9874.32] - - [4096, 1024, 1, 3498] - - [574, 9864.25] + - [578, 9864.25] - - [4096, 1024, 1, 3169] - - [573, 9851.0] + - [577, 9851.0] - - [1024, 33708, 1, 3859] - - [567, 10332.5] + - [571, 10332.5] - - [64, 19, 3264, 19] - - [543, 2182.04] + - [547, 2182.04] - - [1024, 4096, 1, 3103] - - [566, 9898.8] + - [570, 9898.8] - - [4096, 1024, 1, 3900] - - [573, 9897.02] + - [577, 9897.02] - - [1024, 4096, 1, 3442] - - [573, 9938.87] + - [577, 9938.87] - - [1024, 4096, 1, 3248] - - [573, 9939.82] + - [577, 9939.82] - - [1024, 4096, 1, 3351] - - [574, 9923.13] + - [578, 9923.13] - - [4096, 1024, 1, 3593] - - [573, 9894.26] + - [577, 9894.26] - - [1024, 4096, 1, 3780] - - [573, 9941.86] + - [577, 9941.86] - - [64, 133, 480, 133] - - [557, 6180.69] + - [561, 6180.69] - - [1024, 33708, 1, 3681] - - [566, 10332.2] + - [570, 10332.2] - - [4096, 1024, 1, 3374] - - [567, 9859.26] + - [571, 9859.26] - - [1024, 4096, 1, 3557] - - [566, 9928.1] + - [570, 9928.1] - - [4096, 1024, 1, 3906] - - [573, 9906.97] + - [577, 9906.97] - - [4096, 1024, 1, 3504] - - [573, 9885.95] + - [577, 9885.95] - - [1024, 4096, 1, 3270] - - [573, 9916.27] + - [577, 9916.27] - - [4096, 1024, 1, 3098] - - [566, 9854.66] + - [570, 9854.66] - - [64, 232, 272, 232] - - [562, 7394.0] + - [566, 7394.0] - - [4096, 1024, 1, 3216] - - [574, 9876.47] + - [578, 9876.47] - - [64, 148, 432, 148] - - [559, 6663.75] + - [563, 6663.75] - - [1024, 4096, 1, 3550] - - [573, 9920.18] + - [577, 9920.18] - - [4096, 1024, 1, 3449] - - [567, 9870.47] + - [571, 9870.47] - - [1024, 4096, 1, 3403] - - [574, 9908.11] + - [578, 9908.11] - - [1024, 4096, 1, 3523] - - [573, 9932.61] + - [577, 9932.61] - - [1024, 4096, 1, 3486] - - [573, 9917.36] + - [577, 9917.36] - - [1024, 4096, 1, 3564] - - [573, 9923.34] + - [577, 9923.34] - - [1024, 33708, 1, 4005] - - [566, 10339.4] + - [570, 10339.4] - - [4096, 1024, 1, 3296] - - [573, 9879.68] + - [577, 9879.68] - - [1024, 4096, 1, 3263] - - [566, 9907.07] + - [570, 9907.07] - - [64, 25, 2512, 25] - - [543, 2848.07] + - [547, 2848.07] - - [1024, 4096, 1, 3130] - - [574, 9900.0] + - [578, 9900.0] - - [1024, 4096, 1, 3295] - - [574, 9895.35] + - [578, 9895.35] - - [1024, 33708, 1, 3925] - - [567, 10342.2] + - [571, 10342.2] - - [1024, 4096, 1, 3378] - - [566, 9921.27] + - [570, 9921.27] - - [4096, 1024, 1, 3720] - - [574, 9885.72] + - [578, 9885.72] - - [4096, 1024, 1, 3399] - - [573, 9880.55] + - [577, 9880.55] - - [4096, 1024, 1, 3543] - - [574, 9870.63] + - [578, 9870.63] - - [64, 9, 6544, 9] - - [546, 955.07] + - [550, 955.07] - - [4096, 1024, 1, 3497] - - [573, 9868.33] + - [577, 9868.33] - - [4096, 1024, 1, 3594] - - [574, 9876.78] + - [578, 9876.78] - - [1024, 4096, 1, 3144] - - [574, 9901.86] + - [578, 9901.86] - - [1024, 4096, 1, 3975] - - [567, 9950.09] + - [571, 9950.09] - - [4096, 1024, 1, 3205] - - [574, 9855.97] + - [578, 9855.97] - - [1024, 33708, 1, 3995] - - [566, 10331.0] + - [570, 10331.0] - - [1024, 4096, 1, 3392] - - [566, 9935.68] + - [570, 9935.68] - - [1024, 4096, 1, 3055] - - [574, 9893.15] + - [578, 9893.15] - - [1024, 4096, 1, 4026] - - [574, 9940.12] + - [578, 9940.12] - - [4096, 1024, 1, 3557] - - [573, 9883.9] + - [577, 9883.9] - - [4096, 1024, 1, 3515] - - [573, 9871.84] + - [577, 9871.84] - - [4096, 1024, 1, 3486] - - [574, 9860.64] + - [578, 9860.64] - - [4096, 1024, 1, 3457] - - [574, 9885.27] + - [578, 9885.27] - - [1024, 4096, 1, 3511] - - [566, 9928.14] + - [570, 9928.14] - - [4096, 1024, 1, 3138] - - [573, 9853.96] + - [577, 9853.96] - - [1024, 4096, 1, 3339] - - [567, 9912.79] + - [571, 9912.79] - - [1024, 4096, 1, 3939] - - [567, 9952.16] + - [571, 9952.16] - - [4096, 1024, 1, 3500] - - [567, 9863.52] + - [571, 9863.52] - - [4096, 1024, 1, 3395] - - [574, 9883.72] + - [578, 9883.72] - - [4096, 1024, 1, 3968] - - [574, 9920.26] + - [578, 9920.26] - - [4096, 1024, 1, 4020] - - [574, 9912.71] + - [578, 9912.71] - - [4096, 1024, 1, 3942] - - [573, 9910.07] + - [577, 9910.07] - - [1024, 1024, 1, 4032] - - [563, 9024.64] + - [567, 9024.64] - - [4096, 1024, 1, 3349] - - [574, 9865.94] + - [578, 9865.94] - - [1024, 4096, 1, 3322] - - [567, 9908.33] + - [571, 9908.33] - - [4096, 1024, 1, 3452] - - [573, 9872.59] + - [577, 9872.59] - - [1024, 4096, 1, 3417] - - [573, 9912.54] + - [577, 9912.54] - - [1024, 1024, 1, 4012] - - [571, 9085.37] + - [575, 9085.37] - - [1024, 4096, 1, 3526] - - [567, 9920.26] + - [571, 9920.26] - - [4096, 1024, 1, 3485] - - [567, 9861.54] + - [571, 9861.54] - - [1024, 1024, 1, 3681] - - [571, 8991.36] + - [575, 8991.36] - - [4096, 1024, 1, 3303] - - [574, 9861.2] + - [578, 9861.2] - - [4096, 1024, 1, 3344] - - [574, 9892.34] + - [578, 9892.34] - - [1024, 4096, 1, 3479] - - [574, 9921.67] + - [578, 9921.67] - - [4096, 1024, 1, 3300] - - [573, 9868.54] + - [577, 9868.54] - - [1024, 4096, 1, 3439] - - [567, 9918.19] + - [571, 9918.19] - - [4096, 1024, 1, 3280] - - [574, 9875.19] + - [578, 9875.19] - - [1024, 4096, 1, 3245] - - [566, 9910.39] + - [570, 9910.39] - - [1024, 4096, 1, 3328] - - [566, 9941.5] + - [570, 9941.5] - - [4096, 1024, 1, 3418] - - [566, 9870.66] + - [570, 9870.66] - - [1024, 4096, 1, 3493] - - [574, 9938.35] + - [578, 9938.35] - - [1024, 4096, 1, 3500] - - [566, 9916.83] + - [570, 9916.83] - - [1024, 4096, 1, 3166] - - [566, 9898.02] + - [570, 9898.02] - - [4096, 1024, 1, 3126] - - [567, 9846.94] + - [571, 9846.94] - - [1024, 4096, 1, 3277] - - [574, 9898.56] + - [578, 9898.56] - - [1024, 4096, 1, 3315] - - [573, 9923.01] + - [577, 9923.01] - - [1024, 1024, 1, 3927] - - [564, 8987.61] + - [568, 8987.61] - - [1024, 4096, 1, 3414] - - [566, 9915.91] + - [570, 9915.91] - - [4096, 1024, 1, 3531] - - [573, 9871.82] + - [577, 9871.82] - - [4096, 1024, 1, 3484] - - [566, 9867.76] + - [570, 9867.76] - - [1024, 4096, 1, 3180] - - [573, 9903.99] + - [577, 9903.99] - - [4096, 1024, 1, 3360] - - [573, 9879.47] + - [577, 9879.47] - - [1024, 33708, 1, 3990] - - [566, 10334.9] + - [570, 10334.9] - - [4096, 1024, 1, 3466] - - [573, 9874.92] + - [577, 9874.92] - - [1024, 4096, 1, 3428] - - [566, 9915.92] + - [570, 9915.92] - - [1024, 4096, 1, 3137] - - [573, 9913.17] + - [577, 9913.17] - - [4096, 1024, 1, 4059] - - [573, 9901.76] + - [577, 9901.76] - - [1024, 4096, 1, 3353] - - [573, 9914.5] + - [577, 9914.5] - - [1024, 4096, 1, 3942] - - [573, 9944.4] + - [577, 9944.4] - - [4096, 1024, 1, 3506] - - [566, 9875.65] + - [570, 9875.65] - - [1024, 1024, 1, 3894] - - [564, 8946.45] + - [568, 8946.45] - - [4096, 1024, 1, 3508] - - [574, 9877.57] + - [578, 9877.57] - - [64, 132, 480, 135] - - [557, 6164.76] + - [561, 6164.76] - - [4096, 1024, 1, 3956] - - [566, 9907.73] + - [570, 9907.73] - - [64, 7, 8192, 7] - - [545, 812.978] + - [549, 812.978] - - [1024, 4096, 1, 3272] - - [567, 9909.72] + - [571, 9909.72] - - [1024, 4096, 1, 3443] - - [574, 9929.73] + - [578, 9929.73] - - [1024, 4096, 1, 3375] - - [574, 9909.13] + - [578, 9909.13] - - [1024, 4096, 1, 3525] - - [574, 9929.17] + - [578, 9929.17] - - [4096, 1024, 1, 3472] - - [573, 9889.87] + - [577, 9889.87] - - [1024, 4096, 1, 3520] - - [566, 9947.69] + - [570, 9947.69] - - [4096, 1024, 1, 3322] - - [573, 9862.88] + - [577, 9862.88] - - [4096, 1024, 1, 3387] - - [573, 9861.52] + - [577, 9861.52] - - [64, 8, 7280, 8] - - [551, 1024.0] + - [555, 1024.0] - - [1024, 33708, 1, 3939] - - [566, 10339.8] + - [570, 10339.8] - - [4096, 1024, 1, 3345] - - [574, 9873.58] + - [578, 9873.58] - - [4096, 1024, 1, 2967] - - [573, 9839.11] + - [577, 9839.11] - - [1024, 4096, 1, 3453] - - [566, 9905.71] + - [570, 9905.71] - - [1024, 4096, 1, 3640] - - [573, 9933.95] + - [577, 9933.95] - - [4096, 1024, 1, 3291] - - [567, 9860.74] + - [571, 9860.74] - - [1024, 4096, 1, 3350] - - [574, 9917.93] + - [578, 9917.93] - - [4096, 1024, 1, 3417] - - [573, 9864.51] + - [577, 9864.51] - - [64, 135, 480, 135] - - [557, 6265.35] + - [561, 6265.35] - - [1024, 4096, 1, 3467] - - [567, 9906.85] + - [571, 9906.85] - - [1024, 4096, 1, 3491] - - [573, 9933.2] + - [577, 9933.2] - - [1024, 4096, 1, 3822] - - [573, 9938.65] + - [577, 9938.65] - - [4096, 1024, 1, 3292] - - [573, 9849.11] + - [577, 9849.11] - - [1024, 4096, 1, 3231] - - [566, 9905.72] + - [570, 9905.72] - - [1024, 4096, 1, 3364] - - [567, 9930.22] + - [571, 9930.22] - - [1024, 4096, 1, 3995] - - [567, 9943.66] + - [571, 9943.66] - - [1024, 4096, 1, 3545] - - [566, 9928.43] + - [570, 9928.43] - - [1024, 1024, 1, 3876] - - [564, 9002.94] + - [568, 9002.94] - - [1024, 4096, 1, 3186] - - [566, 9920.91] + - [570, 9920.91] - - [4096, 1024, 1, 3432] - - [573, 9875.19] + - [577, 9875.19] - - [64, 84, 752, 85] - - [544, 5704.41] + - [548, 5704.41] - - [4096, 1024, 1, 3367] - - [567, 9867.96] + - [571, 9867.96] - - [4096, 1024, 1, 3503] - - [574, 9870.91] + - [578, 9870.91] - - [1024, 4096, 1, 3095] - - [567, 9902.8] + - [571, 9902.8] - - [4096, 1024, 1, 3465] - - [574, 9872.07] + - [578, 9872.07] - - [1024, 4096, 1, 3402] - - [573, 9914.56] + - [577, 9914.56] - - [4096, 1024, 1, 3140] - - [573, 9847.85] + - [577, 9847.85] - - [1024, 1024, 1, 4050] - - [570, 9055.65] + - [574, 9055.65] - - [4096, 1024, 1, 3424] - - [567, 9894.52] + - [571, 9894.52] - - [4096, 1024, 1, 3257] - - [566, 9860.87] + - [570, 9860.87] - - [4096, 1024, 1, 2917] - - [573, 9845.81] + - [577, 9845.81] - - [1024, 33708, 1, 3640] - - [566, 10321.6] + - [570, 10321.6] - - [1024, 4096, 1, 3456] - - [566, 9950.25] + - [570, 9950.25] - - [1024, 4096, 1, 3014] - - [566, 9907.87] + - [570, 9907.87] - - [4096, 1024, 1, 3372] - - [574, 9868.27] + - [578, 9868.27] - - [64, 132, 480, 132] - - [557, 6121.52] + - [561, 6121.52] - - [1024, 4096, 1, 3294] - - [574, 9903.13] + - [578, 9903.13] - - [4096, 1024, 1, 3446] - - [574, 9871.59] + - [578, 9871.59] - - [1024, 4096, 1, 3389] - - [567, 9909.17] + - [571, 9909.17] - - [4096, 1024, 1, 3259] - - [573, 9860.66] + - [577, 9860.66] - - [4096, 1024, 1, 3544] - - [573, 9878.66] + - [577, 9878.66] - - [4096, 1024, 1, 3479] - - [574, 9873.87] + - [578, 9873.87] - - [4096, 1024, 1, 3542] - - [573, 9878.87] + - [577, 9878.87] - - [4096, 1024, 1, 3321] - - [566, 9861.03] + - [570, 9861.03] - - [1024, 4096, 1, 3147] - - [566, 9894.67] + - [570, 9894.67] - - [1024, 4096, 1, 3944] - - [566, 9950.41] + - [570, 9950.41] - - [4096, 1024, 1, 3870] - - [574, 9881.64] + - [578, 9881.64] - - [1024, 4096, 1, 3308] - - [566, 9907.16] + - [570, 9907.16] - - [4096, 1024, 1, 3401] - - [573, 9864.49] + - [577, 9864.49] - - [1024, 4096, 1, 3395] - - [566, 9928.93] + - [570, 9928.93] - - [64, 99, 624, 102] - - [542, 5651.26] + - [546, 5651.26] - - [1024, 4096, 1, 3563] - - [573, 9922.66] + - [577, 9922.66] - - [1024, 33708, 1, 3870] - - [566, 10325.3] + - [570, 10325.3] - - [4096, 1024, 1, 3494] - - [573, 9875.27] + - [577, 9875.27] - - [1024, 4096, 1, 3271] - - [566, 9912.99] + - [570, 9912.99] - - [1024, 33708, 1, 3910] - - [566, 10341.4] + - [570, 10341.4] - - [1024, 4096, 1, 3287] - - [574, 9924.77] + - [578, 9924.77] - - [1024, 33708, 1, 3860] - - [566, 10330.6] + - [570, 10330.6] - - [64, 143, 432, 148] - - [559, 6571.68] + - [563, 6571.68] - - [1024, 1024, 1, 3584] - - [571, 8975.21] + - [575, 8975.21] - - [64, 162, 400, 162] - - [561, 6822.16] + - [565, 6822.16] - - [4096, 1024, 1, 3341] - - [573, 9854.56] + - [577, 9854.56] - - [1024, 4096, 1, 3136] - - [566, 9926.76] + - [570, 9926.76] - - [4096, 1024, 1, 3439] - - [573, 9854.23] + - [577, 9854.23] - - [64, 148, 432, 147] - - [557, 6677.51] + - [561, 6677.51] - - [1024, 4096, 1, 3751] - - [573, 9938.38] + - [577, 9938.38] - - [1024, 4096, 1, 3301] - - [573, 9919.05] + - [577, 9919.05] - - [4096, 1024, 1, 3468] - - [574, 9859.73] + - [578, 9859.73] - - [1024, 4096, 1, 3416] - - [574, 9918.42] + - [578, 9918.42] - - [4096, 1024, 1, 3163] - - [573, 9854.55] + - [577, 9854.55] - - [1024, 4096, 1, 3230] - - [567, 9897.44] + - [571, 9897.44] - - [1024, 4096, 1, 3581] - - [567, 9915.38] + - [571, 9915.38] - - [1024, 1024, 1, 3960] - - [569, 9045.76] + - [573, 9045.76] - - [4096, 1024, 1, 3463] - - [574, 9884.64] + - [578, 9884.64] - - [1024, 4096, 1, 3478] - - [567, 9926.92] + - [571, 9926.92] - - [4096, 1024, 1, 3262] - - [573, 9852.12] + - [577, 9852.12] - - [1024, 4096, 1, 3438] - - [573, 9912.58] + - [577, 9912.58] - - [1024, 4096, 1, 3244] - - [566, 9900.41] + - [570, 9900.41] - - [1024, 4096, 1, 3445] - - [566, 9920.22] + - [570, 9920.22] - - [4096, 1024, 1, 3328] - - [573, 9887.97] + - [577, 9887.97] - - [1024, 4096, 1, 3492] - - [567, 9937.12] + - [571, 9937.12] - - [4096, 1024, 1, 3211] - - [567, 9847.85] + - [571, 9847.85] - - [1024, 4096, 1, 3910] - - [574, 9946.47] + - [578, 9946.47] - - [1024, 4096, 1, 3314] - - [566, 9932.5] + - [570, 9932.5] - - [4096, 1024, 1, 3859] - - [573, 9902.74] + - [577, 9902.74] - - [4096, 1024, 1, 3383] - - [573, 9875.1] + - [577, 9875.1] - - [1024, 4096, 1, 3409] - - [574, 9926.69] + - [578, 9926.69] - - [1024, 4096, 1, 4020] - - [566, 9941.7] + - [570, 9941.7] - - [4096, 1024, 1, 3530] - - [573, 9872.71] + - [577, 9872.71] - - [4096, 1024, 1, 3411] - - [574, 9874.92] + - [578, 9874.92] - - [1024, 4096, 1, 3566] - - [574, 9921.0] + - [578, 9921.0] - - [4096, 1024, 1, 3493] - - [566, 9875.64] + - [570, 9875.64] - - [4096, 1024, 1, 3184] - - [573, 9873.04] + - [577, 9873.04] - - [1024, 4096, 1, 3072] - - [566, 9923.69] + - [570, 9923.69] - - [1024, 4096, 1, 3431] - - [567, 9910.93] + - [571, 9910.93] - - [4096, 1024, 1, 3306] - - [574, 9853.32] + - [578, 9853.32] - - [1024, 4096, 1, 3352] - - [574, 9913.22] + - [578, 9913.22] - - [4096, 1024, 1, 3295] - - [573, 9862.58] + - [577, 9862.58] - - [64, 123, 528, 122] - - [539, 6950.15] + - [543, 6950.15] - - [1024, 4096, 1, 3517] - - [567, 9919.96] + - [571, 9919.96] - - [64, 102, 624, 101] - - [547, 5791.39] + - [551, 5791.39] - - [4096, 1024, 1, 3426] - - [573, 9891.04] + - [577, 9891.04] - - [4096, 1024, 1, 3385] - - [573, 9868.31] + - [577, 9868.31] - - [1024, 1024, 1, 3978] - - [564, 9008.38] + - [568, 9008.38] - - [4096, 1024, 1, 3572] - - [566, 9884.71] + - [570, 9884.71] - - [4096, 1024, 1, 3459] - - [573, 9892.07] + - [577, 9892.07] - - [1024, 4096, 1, 3374] - - [574, 9908.42] + - [578, 9908.42] - - [4096, 1024, 1, 3166] - - [573, 9832.35] + - [577, 9832.35] - - [4096, 1024, 1, 3093] - - [574, 9841.15] + - [578, 9841.15] - - [4096, 1024, 1, 3523] - - [567, 9878.95] + - [571, 9878.95] - - [4096, 1024, 1, 3413] - - [567, 9880.71] + - [571, 9880.71] - - [1024, 4096, 1, 3996] - - [566, 9948.04] + - [570, 9948.04] - - [1024, 4096, 1, 3452] - - [574, 9915.87] + - [578, 9915.87] - - [4096, 1024, 1, 3232] - - [574, 9876.44] + - [578, 9876.44] - - [4096, 1024, 1, 3400] - - [566, 9867.05] + - [570, 9867.05] - - [4096, 1024, 1, 3334] - - [573, 9868.89] + - [577, 9868.89] - - [1024, 4096, 1, 3345] - - [566, 9920.5] + - [570, 9920.5] - - [1024, 4096, 1, 3538] - - [573, 9933.24] + - [577, 9933.24] - - [1024, 4096, 1, 3466] - - [573, 9920.75] + - [577, 9920.75] - - [4096, 1024, 1, 3315] - - [573, 9876.77] + - [577, 9876.77] - - [4096, 1024, 1, 3214] - - [574, 9847.83] + - [578, 9847.83] - - [1024, 33708, 1, 3900] - - [566, 10331.6] + - [570, 10331.6] - - [64, 160, 400, 160] - - [559, 7440.51] + - [563, 7440.51] - - [1024, 4096, 1, 3367] - - [573, 9926.22] + - [577, 9926.22] - - [1024, 4096, 1, 2917] - - [574, 9904.47] + - [578, 9904.47] - - [1024, 1024, 1, 3995] - - [565, 9000.23] + - [569, 9000.23] - - [64, 132, 480, 134] - - [557, 6146.78] + - [561, 6146.78] - - [1024, 4096, 1, 3544] - - [574, 9924.04] + - [578, 9924.04] - - [4096, 1024, 1, 3414] - - [574, 9867.8] + - [578, 9867.8] - - [4096, 1024, 1, 3565] - - [567, 9870.03] + - [571, 9870.03] - - [1024, 4096, 1, 3512] - - [573, 9919.74] + - [577, 9919.74] - - [1024, 4096, 1, 3191] - - [574, 9914.69] + - [578, 9914.69] - - [64, 27, 2336, 27] - - [541, 3054.61] + - [545, 3054.61] - - [1024, 4096, 1, 3289] - - [574, 9917.1] + - [578, 9917.1] - - [4096, 1024, 1, 3290] - - [573, 9858.31] + - [577, 9858.31] - - [1024, 4096, 1, 3211] - - [574, 9897.06] + - [578, 9897.06] - - [1024, 33708, 1, 3969] - - [567, 10336.0] + - [571, 10336.0] - - [4096, 1024, 1, 3566] - - [573, 9862.9] + - [577, 9862.9] - - [64, 111, 576, 111] - - [547, 6400.81] + - [551, 6400.81] - - [1024, 4096, 1, 3459] - - [573, 9922.93] + - [577, 9922.93] - - [1024, 4096, 1, 3372] - - [566, 9909.76] + - [570, 9909.76] - - [4096, 1024, 1, 3339] - - [573, 9859.2] + - [577, 9859.2] - - [4096, 1024, 1, 3425] - - [573, 9889.24] + - [577, 9889.24] - - [4096, 1024, 1, 3388] - - [573, 9871.57] + - [577, 9871.57] - - [1024, 4096, 1, 3531] - - [566, 9918.9] + - [570, 9918.9] - - [4096, 1024, 1, 3286] - - [574, 9868.32] + - [578, 9868.32] - - [4096, 1024, 1, 3462] - - [573, 9881.78] + - [577, 9881.78] - - [1024, 4096, 1, 3388] - - [566, 9904.59] + - [570, 9904.59] - - [4096, 1024, 1, 3165] - - [566, 9836.23] + - [570, 9836.23] - - [4096, 1024, 1, 3304] - - [573, 9857.45] + - [577, 9857.45] - - [1024, 4096, 1, 2736] - - [573, 9900.97] + - [577, 9900.97] - - [4096, 1024, 1, 3397] - - [573, 9872.0] + - [577, 9872.0] - - [64, 38, 1680, 38] - - [540, 3459.42] + - [544, 3459.42] - - [1024, 4096, 1, 3311] - - [574, 9908.22] + - [578, 9908.22] - - [1024, 4096, 1, 3394] - - [574, 9929.33] + - [578, 9929.33] - - [4096, 1024, 1, 2736] - - [573, 9833.78] + - [577, 9833.78] - - [1024, 4096, 1, 3559] - - [567, 9925.23] + - [571, 9925.23] - - [4096, 1024, 1, 3180] - - [573, 9837.95] + - [577, 9837.95] - - [1024, 4096, 1, 3480] - - [566, 9922.36] + - [570, 9922.36] - - [4096, 1024, 1, 3318] - - [573, 9867.77] + - [577, 9867.77] - - [4096, 1024, 1, 3213] - - [573, 9845.92] + - [577, 9845.92] - - [1024, 4096, 1, 3286] - - [573, 9912.04] + - [577, 9912.04] - - [4096, 1024, 1, 3471] - - [573, 9874.14] + - [577, 9874.14] - - [1024, 4096, 1, 3381] - - [574, 9922.86] + - [578, 9922.86] - - [64, 100, 624, 100] - - [548, 5705.14] + - [552, 5705.14] - - [4096, 1024, 1, 3502] - - [573, 9872.34] + - [577, 9872.34] - - [64, 16, 3840, 16] - - [554, 2091.57] + - [558, 2091.57] - - [1024, 4096, 1, 3552] - - [566, 9943.79] + - [570, 9943.79] - - [4096, 1024, 1, 3519] - - [574, 9869.85] + - [578, 9869.85] - - [1024, 4096, 1, 3300] - - [567, 9916.05] + - [571, 9916.05] - - [1024, 4096, 1, 3419] - - [566, 9913.96] + - [570, 9913.96] - - [4096, 1024, 1, 4030] - - [567, 9893.63] + - [571, 9893.63] - - [4096, 1024, 1, 3976] - - [574, 9898.25] + - [578, 9898.25] - - [1024, 4096, 1, 3473] - - [574, 9928.32] + - [578, 9928.32] - - [1024, 1024, 1, 3977] - - [571, 9009.23] + - [575, 9009.23] - - [4096, 1024, 1, 3428] - - [573, 9876.69] + - [577, 9876.69] - - [1024, 4096, 1, 3433] - - [567, 9923.82] + - [571, 9923.82] - - [4096, 1024, 1, 3534] - - [567, 9863.9] + - [571, 9863.9] - - [4096, 1024, 1, 3461] - - [573, 9873.02] + - [577, 9873.02] - - [4096, 1024, 1, 3681] - - [573, 9898.47] + - [577, 9898.47] - - [4096, 1024, 1, 3495] - - [574, 9875.98] + - [578, 9875.98] - - [4096, 1024, 1, 3351] - - [573, 9879.61] + - [577, 9879.61] - - [1024, 4096, 1, 4059] - - [566, 9948.51] + - [570, 9948.51] - - [4096, 1024, 1, 3990] - - [573, 9900.66] + - [577, 9900.66] - - [1024, 4096, 1, 3325] - - [567, 9903.2] + - [571, 9903.2] - - [1024, 4096, 1, 3408] - - [573, 9932.05] + - [577, 9932.05] - - [64, 59, 1088, 59] - - [547, 5343.67] + - [551, 5343.67] - - [4096, 1024, 1, 3394] - - [574, 9878.07] + - [578, 9878.07] - - [1024, 4096, 1, 3573] - - [574, 9935.2] + - [578, 9935.2] - - [4096, 1024, 1, 3386] - - [573, 9866.28] + - [577, 9866.28] - - [4096, 1024, 1, 3540] - - [573, 9882.23] + - [577, 9882.23] - - [1024, 4096, 1, 3182] - - [567, 9894.35] + - [571, 9894.35] - - [1024, 4096, 1, 3430] - - [566, 9915.14] + - [570, 9915.14] - - [1024, 4096, 1, 3236] - - [574, 9920.46] + - [578, 9920.46] - - [4096, 1024, 1, 2977] - - [573, 9847.98] + - [577, 9847.98] - - [1024, 4096, 1, 3355] - - [573, 9908.68] + - [577, 9908.68] - - [4096, 1024, 1, 3139] - - [573, 9850.61] + - [577, 9850.61] - - [4096, 1024, 1, 3516] - - [567, 9874.11] + - [571, 9874.11] - - [4096, 1024, 1, 3368] - - [567, 9872.54] + - [571, 9872.54] - - [4096, 1024, 1, 3559] - - [566, 9884.22] + - [570, 9884.22] - - [64, 11, 5456, 11] - - [554, 1382.57] + - [558, 1382.57] - - [1024, 4096, 1, 3506] - - [573, 9937.59] + - [577, 9937.59] - - [1024, 4096, 1, 3145] - - [566, 9905.01] + - [570, 9905.01] - - [1024, 4096, 1, 3369] - - [573, 9912.61] + - [577, 9912.61] - - [64, 112, 576, 112] - - [539, 6583.46] + - [543, 6583.46] - - [4096, 1024, 1, 3522] - - [573, 9889.37] + - [577, 9889.37] - - [1024, 33708, 1, 3894] - - [566, 10337.4] + - [570, 10337.4] - - [64, 159, 400, 162] - - [557, 7056.99] + - [561, 7056.99] - - [4096, 1024, 1, 3336] - - [573, 9867.57] + - [577, 9867.57] - - [1024, 4096, 1, 3382] - - [567, 9915.8] + - [571, 9915.8] - - [4096, 1024, 1, 3533] - - [573, 9878.46] + - [577, 9878.46] - - [4096, 1024, 1, 4050] - - [574, 9916.72] + - [578, 9916.72] - - [4096, 1024, 1, 3480] - - [567, 9869.22] + - [571, 9869.22] - - [1024, 4096, 1, 3344] - - [566, 9935.51] + - [570, 9935.51] - - [64, 122, 528, 122] - - [539, 6871.04] + - [543, 6871.04] - - [1024, 4096, 1, 3509] - - [567, 9925.7] + - [571, 9925.7] - - [1024, 4096, 1, 3956] - - [566, 9958.16] + - [570, 9958.16] - - [4096, 1024, 1, 3616] - - [573, 9904.53] + - [577, 9904.53] - - [1024, 4096, 1, 3366] - - [566, 9919.37] + - [570, 9919.37] - - [4096, 1024, 1, 2935] - - [566, 9833.13] + - [570, 9833.13] - - [4096, 1024, 1, 3393] - - [573, 9877.35] + - [577, 9877.35] - - [4096, 1024, 1, 3547] - - [567, 9865.0] + - [571, 9865.0] - - [1024, 4096, 1, 3499] - - [574, 9912.39] + - [578, 9912.39] - - [4096, 1024, 1, 3357] - - [573, 9855.18] + - [577, 9855.18] - - [4096, 1024, 1, 3272] - - [573, 9861.87] + - [577, 9861.87] - - [4096, 1024, 1, 3207] - - [573, 9847.68] + - [577, 9847.68] - - [4096, 1024, 1, 3894] - - [573, 9918.76] + - [577, 9918.76] - - [1024, 4096, 1, 3444] - - [573, 9932.61] + - [577, 9932.61] - - [4096, 1024, 1, 3561] - - [573, 9872.51] + - [577, 9872.51] - - [4096, 1024, 1, 3376] - - [573, 9885.49] + - [577, 9885.49] - - [1024, 4096, 1, 3458] - - [573, 9929.29] + - [577, 9929.29] - - [4096, 1024, 1, 3231] - - [567, 9846.98] + - [571, 9846.98] - - [64, 228, 272, 228] - - [568, 7302.59] + - [572, 7302.59] - - [1024, 4096, 1, 3505] - - [574, 9931.53] + - [578, 9931.53] - - [4096, 1024, 1, 3277] - - [573, 9857.1] + - [577, 9857.1] - - [64, 21, 2976, 21] - - [543, 2436.04] + - [547, 2436.04] - - [1024, 4096, 1, 3391] - - [573, 9911.15] + - [577, 9911.15] - - [64, 32, 1984, 32] - - [555, 3572.07] + - [559, 3572.07] - - [1024, 4096, 1, 3536] - - [574, 9946.8] + - [578, 9946.8] - - [1024, 4096, 1, 3063] - - [573, 9906.82] + - [577, 9906.82] - - [1024, 1024, 1, 3925] - - [565, 9011.35] + - [569, 9011.35] - - [1024, 4096, 1, 3189] - - [567, 9900.85] + - [571, 9900.85] - - [1024, 4096, 1, 2505] - - [573, 9854.75] + - [577, 9854.75] - - [4096, 1024, 1, 3454] - - [566, 9864.86] + - [570, 9864.86] - - [1024, 4096, 1, 3405] - - [574, 9906.23] + - [578, 9906.23] - - [1024, 33708, 1, 4050] - - [567, 10343.6] + - [571, 10343.6] - - [4096, 1024, 1, 3520] - - [573, 9886.93] + - [577, 9886.93] - - [64, 93, 688, 93] - - [550, 6222.76] + - [554, 6222.76] - - [1024, 4096, 1, 3487] - - [574, 9918.59] + - [578, 9918.59] - - [1024, 4096, 1, 3558] - - [574, 9930.89] + - [578, 9930.89] - - [4096, 1024, 1, 3297] - - [573, 9874.21] + - [577, 9874.21] - - [1024, 1024, 1, 3840] - - [569, 9075.32] + - [573, 9075.32] - - [1024, 4096, 1, 3483] - - [573, 9915.28] + - [577, 9915.28] - - [1024, 1024, 1, 3956] - - [572, 9009.93] + - [576, 9009.93] - - [1024, 33708, 1, 3751] - - [567, 10325.8] + - [571, 10325.8] - - [4096, 1024, 1, 3380] - - [573, 9888.37] + - [577, 9888.37] - - [1024, 4096, 1, 3380] - - [566, 9927.15] + - [570, 9927.15] - - [1024, 4096, 1, 3396] - - [574, 9931.86] + - [578, 9931.86] - - [1024, 4096, 1, 3497] - - [567, 9914.76] + - [571, 9914.76] - - [1024, 4096, 1, 3502] - - [574, 9921.42] + - [578, 9921.42] - - [1024, 1024, 1, 3976] - - [569, 9060.2] + - [573, 9060.2] - - [1024, 4096, 1, 3138] - - [567, 9908.56] + - [571, 9908.56] - - [4096, 1024, 1, 3939] - - [566, 9910.13] + - [570, 9910.13] - - [1024, 4096, 1, 3303] - - [567, 9916.54] + - [571, 9916.54] - - [64, 111, 576, 112] - - [547, 6495.09] + - [551, 6495.09] - - [1024, 4096, 1, 3418] - - [573, 9913.25] + - [577, 9913.25] - - [1024, 4096, 1, 3224] - - [567, 9903.95] + - [571, 9903.95] - - [4096, 1024, 1, 3978] - - [573, 9896.18] + - [577, 9896.18] - - [1024, 4096, 1, 3472] - - [566, 9937.38] + - [570, 9937.38] - - [4096, 1024, 1, 3353] - - [574, 9863.87] + - [578, 9863.87] - - [4096, 1024, 1, 3362] - - [573, 9870.96] + - [577, 9870.96] - - [1024, 33708, 1, 3978] - - [566, 10325.3] + - [570, 10325.3] - - [64, 100, 624, 102] - - [542, 5695.57] + - [546, 5695.57] - - [1024, 4096, 1, 3432] - - [574, 9915.46] + - [578, 9915.46] - - [1024, 4096, 1, 3139] - - [573, 9914.11] + - [577, 9914.11] - - [1024, 4096, 1, 3341] - - [574, 9912.0] + - [578, 9912.0] - - [1024, 4096, 1, 3494] - - [567, 9924.5] + - [571, 9924.5] - - [1024, 4096, 1, 3969] - - [566, 9952.18] + - [570, 9952.18] - - [1024, 4096, 1, 3163] - - [574, 9911.69] + - [578, 9911.69] - - [1024, 1024, 1, 3955] - - [564, 9097.76] + - [568, 9097.76] - - [4096, 1024, 1, 3405] - - [573, 9853.74] + - [577, 9853.74] - - [1024, 1024, 1, 4030] - - [564, 9083.76] + - [568, 9083.76] - - [4096, 1024, 1, 3453] - - [573, 9858.78] + - [577, 9858.78] - - [1024, 4096, 1, 3411] - - [574, 9926.44] + - [578, 9926.44] - - [1024, 4096, 1, 3527] - - [567, 9922.55] + - [571, 9922.55] - - [4096, 1024, 1, 3474] - - [573, 9878.39] + - [577, 9878.39] - - [1024, 4096, 1, 3572] - - [573, 9931.9] + - [577, 9931.9] - - [4096, 1024, 1, 3293] - - [573, 9848.16] + - [577, 9848.16] - - [4096, 1024, 1, 3247] - - [573, 9861.35] + - [577, 9861.35] - - [64, 15, 4096, 15] - - [554, 1955.65] + - [558, 1955.65] - - [1024, 4096, 1, 3425] - - [574, 9936.3] + - [578, 9936.3] - - [1024, 4096, 1, 3354] - - [566, 9917.45] + - [570, 9917.45] - - [4096, 1024, 1, 3382] - - [573, 9885.39] + - [577, 9885.39] - - [4096, 1024, 1, 3236] - - [573, 9860.5] + - [577, 9860.5] - - [1024, 4096, 1, 3519] - - [574, 9919.2] + - [578, 9919.2] - - [4096, 1024, 1, 3354] - - [573, 9854.65] + - [577, 9854.65] - - [4096, 1024, 1, 3501] - - [574, 9869.52] + - [578, 9869.52] - - [1024, 1024, 1, 3906] - - [572, 9104.89] + - [576, 9104.89] - - [4096, 1024, 1, 3266] - - [573, 9873.87] + - [577, 9873.87] - - [64, 101, 624, 102] - - [542, 5765.42] + - [546, 5765.42] - - [1024, 4096, 1, 3368] - - [573, 9909.67] + - [577, 9909.67] - - [1024, 4096, 1, 4030] - - [574, 9940.17] + - [578, 9940.17] - - [1024, 4096, 1, 3533] - - [567, 9916.54] + - [571, 9916.54] - - [4096, 1024, 1, 3332] - - [574, 9876.35] + - [578, 9876.35] - - [4096, 1024, 1, 3584] - - [573, 9896.5] + - [577, 9896.5] - - [1024, 4096, 1, 3616] - - [573, 9957.08] + - [577, 9957.08] - - [4096, 1024, 1, 3265] - - [573, 9877.68] + - [577, 9877.68] - - [4096, 1024, 1, 3361] - - [573, 9888.51] + - [577, 9888.51] - - [4096, 1024, 1, 3467] - - [573, 9863.3] + - [577, 9863.3] - - [1024, 4096, 1, 3454] - - [567, 9904.79] + - [571, 9904.79] - - [1024, 4096, 1, 3101] - - [574, 9893.02] + - [578, 9893.02] - - [1024, 4096, 1, 3508] - - [574, 9931.44] + - [578, 9931.44] - - [4096, 1024, 1, 3267] - - [573, 9864.38] + - [577, 9864.38] - - [64, 54, 1184, 54] - - [539, 4905.92] + - [543, 4905.92] - - [4096, 1024, 1, 3419] - - [573, 9872.46] + - [577, 9872.46] - - [4096, 1024, 1, 3822] - - [573, 9892.53] + - [577, 9892.53] - - [1024, 4096, 1, 3266] - - [573, 9918.48] + - [577, 9918.48] - - [4096, 1024, 1, 3440] - - [574, 9890.06] + - [578, 9890.06] - - [1024, 4096, 1, 3361] - - [573, 9930.87] + - [577, 9930.87] - - [1024, 4096, 1, 3546] - - [567, 9926.46] + - [571, 9926.46] - - [4096, 1024, 1, 3473] - - [573, 9888.96] + - [577, 9888.96] - - [4096, 1024, 1, 3546] - - [574, 9872.17] + - [578, 9872.17] - - [1024, 4096, 1, 3088] - - [567, 9917.93] + - [571, 9917.93] - - [1024, 4096, 1, 3535] - - [574, 9921.1] + - [578, 9921.1] - - [1024, 4096, 1, 3447] - - [574, 9920.53] + - [578, 9920.53] - - [1024, 4096, 1, 3560] - - [573, 9925.38] + - [577, 9925.38] - - [1024, 4096, 1, 3422] - - [567, 9922.11] + - [571, 9922.11] - - [1024, 4096, 1, 3469] - - [566, 9906.08] + - [570, 9906.08] - - [4096, 1024, 1, 3488] - - [573, 9903.16] + - [577, 9903.16] - - [1024, 4096, 1, 3110] - - [573, 9906.66] + - [577, 9906.66] - - [1024, 4096, 1, 3265] - - [574, 9916.59] + - [578, 9916.59] - - [1024, 4096, 1, 3291] - - [573, 9902.63] + - [577, 9902.63] - - [1024, 4096, 1, 3390] - - [574, 9907.12] + - [578, 9907.12] - - [4096, 1024, 1, 3046] - - [573, 9847.58] + - [577, 9847.58] - - [1024, 4096, 1, 3539] - - [574, 9933.39] + - [578, 9933.39] - - [4096, 1024, 1, 3221] - - [574, 9860.64] + - [578, 9860.64] - - [4096, 1024, 1, 3433] - - [573, 9872.64] + - [577, 9872.64] - - [4096, 1024, 1, 3364] - - [574, 9881.81] + - [578, 9881.81] - - [4096, 1024, 1, 3470] - - [573, 9858.46] + - [577, 9858.46] - - [1024, 4096, 1, 3404] - - [566, 9907.17] + - [570, 9907.17] - - [1024, 33708, 1, 3968] - - [567, 10350.2] + - [571, 10350.2] - - [4096, 1024, 1, 3088] - - [573, 9868.96] + - [577, 9868.96] - - [1024, 4096, 1, 3247] - - [573, 9900.92] + - [577, 9900.92] - - [1024, 33708, 1, 3996] - - [566, 10328.4] + - [570, 10328.4] - - [4096, 1024, 1, 3482] - - [574, 9866.89] + - [578, 9866.89] - - [1024, 1024, 1, 3796] - - [569, 9031.58] + - [573, 9031.58] - - [4096, 1024, 1, 3995] - - [574, 9896.68] + - [578, 9896.68] - - [1024, 1024, 1, 3859] - - [571, 9097.26] + - [575, 9097.26] - - [1024, 4096, 1, 3280] - - [567, 9933.95] + - [571, 9933.95] - - [4096, 1024, 1, 3271] - - [574, 9859.99] + - [578, 9859.99] - - [64, 10, 5952, 10] - - [554, 1220.92] + - [558, 1220.92] - - [4096, 1024, 1, 3545] - - [573, 9877.25] + - [577, 9877.25] - - [4096, 1024, 1, 3476] - - [566, 9882.47] + - [570, 9882.47] - - [4096, 1024, 1, 3496] - - [567, 9880.4] + - [571, 9880.4] - - [4096, 1024, 1, 3191] - - [567, 9858.6] + - [571, 9858.6] - - [4096, 1024, 1, 3311] - - [574, 9853.1] + - [578, 9853.1] - - [1024, 4096, 1, 3302] - - [574, 9919.22] + - [578, 9919.22] - - [1024, 4096, 1, 3681] - - [573, 9944.89] + - [577, 9944.89] - - [4096, 1024, 1, 3582] - - [566, 9869.67] + - [570, 9869.67] - - [4096, 1024, 1, 3421] - - [574, 9855.98] + - [578, 9855.98] - - [4096, 1024, 1, 3560] - - [567, 9884.38] + - [571, 9884.38] - - [1024, 4096, 1, 3495] - - [574, 9930.03] + - [578, 9930.03] - - [4096, 1024, 1, 3186] - - [573, 9870.49] + - [577, 9870.49] - - [4096, 1024, 1, 3925] - - [573, 9903.9] + - [577, 9903.9] - - [64, 71, 896, 71] - - [558, 5004.69] + - [562, 5004.69] - - [1024, 4096, 1, 3435] - - [574, 9916.48] + - [578, 9916.48] - - [4096, 1024, 1, 3434] - - [573, 9871.19] + - [577, 9871.19] - - [1024, 33708, 1, 4012] - - [566, 10332.4] + - [570, 10332.4] - - [1024, 4096, 1, 3340] - - [566, 9918.01] + - [570, 9918.01] - - [1024, 1024, 1, 3860] - - [564, 8999.26] + - [568, 8999.26] - - [4096, 1024, 1, 3489] - - [573, 9881.92] + - [577, 9881.92] - - [1024, 4096, 1, 3162] - - [574, 9906.18] + - [578, 9906.18] - - [4096, 1024, 1, 3436] - - [573, 9858.02] + - [577, 9858.02] - - [1024, 1024, 1, 4005] - - [570, 9042.96] + - [574, 9042.96] - - [64, 84, 752, 84] - - [543, 5629.83] + - [547, 5629.83] - - [4096, 1024, 1, 3574] - - [573, 9886.6] + - [577, 9886.6] - - [4096, 1024, 1, 3469] - - [566, 9856.16] + - [570, 9856.16] - - [1024, 4096, 1, 3410] - - [567, 9924.64] + - [571, 9924.64] - - [1024, 4096, 1, 3216] - - [566, 9930.57] + - [570, 9930.57] - - [4096, 1024, 1, 3095] - - [573, 9846.91] + - [577, 9846.91] - - [1024, 1024, 1, 3990] - - [572, 9088.94] + - [576, 9088.94] - - [4096, 1024, 1, 3448] - - [573, 9863.84] + - [577, 9863.84] - - [1024, 4096, 1, 3176] - - [574, 9913.91] + - [578, 9913.91] - - [64, 49, 1296, 49] - - [539, 4437.36] + - [543, 4437.36] - - [4096, 1024, 1, 2918] - - [573, 9830.83] + - [577, 9830.83] - - [64, 14, 4368, 14] - - [553, 1802.37] + - [557, 1802.37] - - [1024, 4096, 1, 3424] - - [573, 9933.95] + - [577, 9933.95] - - [4096, 1024, 1, 3402] - - [566, 9863.02] + - [570, 9863.02] - - [4096, 1024, 1, 3145] - - [567, 9856.46] + - [571, 9856.46] - - [64, 134, 480, 134] - - [559, 6183.95] + - [563, 6183.95] - - [1024, 33708, 1, 3976] - - [567, 10330.0] + - [571, 10330.0] - - [4096, 1024, 1, 3518] - - [566, 9855.97] + - [570, 9855.97] - - [4096, 1024, 1, 3110] - - [573, 9856.36] + - [577, 9856.36] - - [4096, 1024, 1, 3325] - - [573, 9852.26] + - [577, 9852.26] - - [1024, 33708, 1, 3999] - - [566, 10329.6] + - [570, 10329.6] - - [4096, 1024, 1, 2985] - - [573, 9837.2] + - [577, 9837.2] - - [1024, 4096, 1, 3371] - - [566, 9912.93] + - [570, 9912.93] - - [4096, 1024, 1, 3342] - - [573, 9863.06] + - [577, 9863.06] - - [4096, 1024, 1, 3141] - - [567, 9849.81] + - [571, 9849.81] - - [4096, 1024, 1, 3532] - - [567, 9866.2] + - [571, 9866.2] - - [64, 78, 816, 78] - - [544, 5316.78] + - [548, 5316.78] - - [1024, 4096, 1, 3169] - - [574, 9910.35] + - [578, 9910.35] - - [1024, 4096, 1, 3514] - - [573, 9917.9] + - [577, 9917.9] - - [4096, 1024, 1, 3780] - - [574, 9899.65] + - [578, 9899.65] - - [1024, 4096, 1, 3098] - - [566, 9901.52] + - [570, 9901.52] - - [1024, 4096, 1, 3449] - - [574, 9919.75] + - [578, 9919.75] - - [1024, 4096, 1, 3222] - - [566, 9917.56] + - [570, 9917.56] - - [1024, 4096, 1, 3346] - - [567, 9912.81] + - [571, 9912.81] - - [4096, 1024, 1, 3064] - - [574, 9848.69] + - [578, 9848.69] - - [4096, 1024, 1, 3511] - - [573, 9873.29] + - [577, 9873.29] - - [4096, 1024, 1, 3384] - - [573, 9870.88] + - [577, 9870.88] - - [4096, 1024, 1, 3356] - - [567, 9853.35] + - [571, 9853.35] - - [1024, 4096, 1, 3796] - - [566, 9940.56] + - [570, 9940.56] - - [4096, 1024, 1, 3427] - - [573, 9883.04] + - [577, 9883.04] - - [4096, 1024, 1, 3390] - - [573, 9863.69] + - [577, 9863.69] - - [4096, 1024, 1, 3573] - - [574, 9885.92] + - [578, 9885.92] - - [4096, 1024, 1, 3456] - - [567, 9890.51] + - [571, 9890.51] - - [1024, 4096, 1, 3360] - - [574, 9938.0] + - [578, 9938.0] - - [1024, 33708, 1, 3977] - - [567, 10327.1] + - [571, 10327.1] - - [1024, 4096, 1, 2918] - - [566, 9902.74] + - [570, 9902.74] - - [4096, 1024, 1, 3975] - - [573, 9905.17] + - [577, 9905.17] - - [4096, 1024, 1, 3525] - - [574, 9879.81] + - [578, 9879.81] - - [4096, 1024, 1, 3398] - - [566, 9873.81] + - [570, 9873.81] - - [4096, 1024, 1, 3640] - - [573, 9885.06] + - [577, 9885.06] - - [1024, 1024, 1, 3999] - - [565, 8995.32] + - [569, 8995.32] - - [4096, 1024, 1, 3014] - - [573, 9841.22] + - [577, 9841.22] - - [1024, 4096, 1, 3446] - - [566, 9917.11] + - [570, 9917.11] - - [1024, 33708, 1, 3796] - - [566, 10338.9] + - [570, 10338.9] - - [4096, 1024, 1, 3101] - - [566, 9827.24] + - [570, 9827.24] - - [4096, 1024, 1, 3563] - - [574, 9862.93] + - [578, 9862.93] - - [4096, 1024, 1, 3539] - - [566, 9889.44] + - [570, 9889.44] - - [4096, 1024, 1, 3182] - - [573, 9833.69] + - [577, 9833.69] - - [1024, 4096, 1, 3468] - - [567, 9912.95] + - [571, 9912.95] - - [4096, 1024, 1, 3312] - - [573, 9889.75] + - [577, 9889.75] - - [4096, 1024, 1, 3215] - - [573, 9853.78] + - [577, 9853.78] - - [4096, 1024, 1, 3910] - - [573, 9894.62] + - [577, 9894.62] - - [1024, 33708, 1, 3780] - - [567, 10331.9] + - [571, 10331.9] - - [1024, 4096, 1, 3290] - - [573, 9914.98] + - [577, 9914.98] - - [1024, 4096, 1, 4012] - - [573, 9942.55] + - [577, 9942.55] - - [1024, 4096, 1, 3385] - - [573, 9915.73] + - [577, 9915.73] - - [1024, 33708, 1, 3975] - - [566, 10330.0] + - [570, 10330.0] - - [4096, 1024, 1, 3996] - - [573, 9891.21] + - [577, 9891.21] - - [4096, 1024, 1, 2765] - - [574, 9800.28] + - [578, 9800.28] - - [4096, 1024, 1, 3538] - - [574, 9886.12] + - [578, 9886.12] - - [4096, 1024, 1, 3415] - - [574, 9874.5] + - [578, 9874.5] - - [1024, 4096, 1, 3554] - - [573, 9931.89] + - [577, 9931.89] - - [4096, 1024, 1, 3513] - - [567, 9874.15] + - [571, 9874.15] - - [1024, 4096, 1, 3304] - - [567, 9907.63] + - [571, 9907.63] - - [4096, 1024, 1, 3294] - - [573, 9851.15] + - [577, 9851.15] - - [4096, 1024, 1, 3396] - - [574, 9880.6] + - [578, 9880.6] - - [1024, 4096, 1, 3213] - - [567, 9891.02] + - [571, 9891.02] - - [4096, 1024, 1, 3137] - - [567, 9857.31] + - [571, 9857.31] - - [4096, 1024, 1, 3552] - - [573, 9904.12] + - [577, 9904.12] - - [1024, 1024, 1, 4020] - - [572, 9098.77] + - [576, 9098.77] - - [64, 13, 4672, 13] - - [554, 1693.44] + - [558, 1693.44] - - [1024, 4096, 1, 3461] - - [573, 9918.35] + - [577, 9918.35] - - [4096, 1024, 1, 3263] - - [566, 9843.79] + - [570, 9843.79] - - [4096, 1024, 1, 3430] - - [573, 9885.16] + - [577, 9885.16] - - [4096, 1024, 1, 3389] - - [573, 9859.13] + - [577, 9859.13] - - [4096, 1024, 1, 3528] - - [573, 9872.91] + - [577, 9872.91] - - [1024, 4096, 1, 3463] - - [574, 9929.51] + - [578, 9929.51] - - [4096, 1024, 1, 3526] - - [574, 9876.8] + - [578, 9876.8] - - [4096, 1024, 1, 3154] - - [573, 9858.15] + - [577, 9858.15] - - [4096, 1024, 1, 3499] - - [574, 9862.82] + - [578, 9862.82] - - [1024, 1024, 1, 3939] - - [572, 9107.31] + - [576, 9107.31] - - [4096, 1024, 1, 3955] - - [574, 9906.18] + - [578, 9906.18] - - [1024, 4096, 1, 3297] - - [567, 9925.24] + - [571, 9925.24] - - [1024, 4096, 1, 3233] - - [573, 9920.55] + - [577, 9920.55] - - [1024, 4096, 1, 3226] - - [573, 9911.25] + - [577, 9911.25] - - [4096, 1024, 1, 3404] - - [573, 9867.18] + - [577, 9867.18] - - [4096, 1024, 1, 3355] - - [573, 9862.56] + - [577, 9862.56] - - [1024, 4096, 1, 3542] - - [573, 9926.39] + - [577, 9926.39] - - [4096, 1024, 1, 3181] - - [574, 9831.76] + - [578, 9831.76] - - [1024, 4096, 1, 3474] - - [573, 9927.93] + - [577, 9927.93] - - [4096, 1024, 1, 3319] - - [573, 9870.18] + - [577, 9870.18] - - [1024, 4096, 1, 3434] - - [566, 9917.41] + - [570, 9917.41] - - [1024, 4096, 1, 3860] - - [573, 9945.22] + - [577, 9945.22] - - [1024, 4096, 1, 3343] - - [566, 9914.56] + - [570, 9914.56] - - [64, 77, 816, 78] - - [544, 5276.87] + - [548, 5276.87] - - [1024, 4096, 1, 3488] - - [573, 9945.71] + - [577, 9945.71] - - [1024, 4096, 1, 3046] - - [573, 9908.68] + - [577, 9908.68] - - [1024, 4096, 1, 3141] - - [574, 9909.08] + - [578, 9909.08] - - [1024, 4096, 1, 3516] - - [574, 9911.28] + - [578, 9911.28] - - [4096, 1024, 1, 3147] - - [573, 9840.37] + - [577, 9840.37] - - [1024, 1024, 1, 4059] - - [565, 9009.68] + - [569, 9009.68] - - [1024, 1024, 1, 3944] - - [565, 9006.07] + - [569, 9006.07] - - [1024, 4096, 1, 3421] - - [574, 9919.76] + - [578, 9919.76] - - [4096, 1024, 1, 3944] - - [567, 9899.43] + - [571, 9899.43] - - [64, 45, 1424, 45] - - [552, 4068.57] + - [556, 4068.57] - - [1024, 4096, 1, 3574] - - [567, 9930.09] + - [571, 9930.09] - - [1024, 4096, 1, 3977] - - [566, 9944.18] + - [570, 9944.18] - - [1024, 1024, 1, 3968] - - [571, 9045.12] + - [575, 9045.12] - - [1024, 4096, 1, 2985] - - [573, 9887.55] + - [577, 9887.55] - - [64, 193, 320, 193] - - [560, 6631.25] + - [564, 6631.25] - - [1024, 4096, 1, 3427] - - [574, 9933.31] + - [578, 9933.31] - - [64, 12, 5040, 12] - - [554, 1552.43] + - [558, 1552.43] - - [1024, 4096, 1, 3482] - - [574, 9942.12] + - [578, 9942.12] - - [1024, 4096, 1, 3332] - - [566, 9923.48] + - [570, 9923.48] - - [1024, 1024, 1, 3720] - - [570, 9039.46] + - [574, 9039.46] - - [4096, 1024, 1, 3308] - - [574, 9852.56] + - [578, 9852.56] - - [1024, 4096, 1, 3513] - - [574, 9919.89] + - [578, 9919.89] - - [1024, 4096, 1, 3154] - - [567, 9908.36] + - [571, 9908.36] - - [1024, 4096, 1, 3955] - - [574, 9949.91] + - [578, 9949.91] - - [1024, 4096, 1, 2967] - - [574, 9897.34] + - [578, 9897.34] - - [1024, 33708, 1, 3942] - - [566, 10336.0] + - [570, 10336.0] - - [1024, 4096, 1, 3319] - - [574, 9912.35] + - [578, 9912.35] - - [4096, 1024, 1, 3860] - - [573, 9909.19] + - [577, 9909.19] - - [1024, 4096, 1, 3548] - - [566, 9924.11] + - [570, 9924.11] - - [4096, 1024, 1, 3977] - - [574, 9891.34] + - [578, 9891.34] - - [4096, 1024, 1, 3535] - - [573, 9867.74] + - [577, 9867.74] - - [1024, 4096, 1, 3541] - - [574, 9923.06] + - [578, 9923.06] - - [1024, 1024, 1, 3910] - - [571, 9080.3] + - [575, 9080.3] - - [1024, 33708, 1, 3584] - - [566, 10332.9] + - [570, 10332.9] - - [1024, 4096, 1, 3168] - - [567, 9926.17] + - [571, 9926.17] - - [1024, 4096, 1, 3448] - - [574, 9922.32] + - [578, 9922.32] - - [4096, 1024, 1, 3343] - - [573, 9857.13] + - [577, 9857.13] - - [64, 35, 1808, 35] - - [556, 3175.34] + - [560, 3175.34] - - [1024, 4096, 1, 3357] - - [567, 9902.31] + - [571, 9902.31] - - [64, 143, 432, 143] - - [557, 6489.6] + - [561, 6489.6] - - [4096, 1024, 1, 3510] - - [573, 9867.3] + - [577, 9867.3] - - [4096, 1024, 1, 3369] - - [573, 9863.34] + - [577, 9863.34] - - [64, 92, 688, 93] - - [544, 6188.2] + - [548, 6188.2] - - [4096, 1024, 1, 3379] - - [573, 9870.02] + - [577, 9870.02] - - [1024, 4096, 1, 3276] - - [573, 9904.67] + - [577, 9904.67] - - [1024, 4096, 1, 3363] - - [573, 9925.03] + - [577, 9925.03] - - [4096, 1024, 1, 3055] - - [573, 9831.82] + - [577, 9831.82] - - [1024, 4096, 1, 3524] - - [566, 9923.69] + - [570, 9923.69] - - [4096, 1024, 1, 3057] - - [573, 9852.77] + - [577, 9852.77] - - [1024, 33708, 1, 3720] - - [567, 10327.0] + - [571, 10327.0] - - [1024, 4096, 1, 3383] - - [566, 9919.29] + - [570, 9919.29] - - [1024, 4096, 1, 3522] - - [567, 9932.46] + - [571, 9932.46] - - [1024, 33708, 1, 3956] - - [566, 10333.7] + - [570, 10333.7] - - [1024, 4096, 1, 3481] - - [566, 9921.98] + - [570, 9921.98] - - [4096, 1024, 1, 3562] - - [574, 9874.76] + - [578, 9874.76] - - [4096, 1024, 1, 3299] - - [573, 9872.87] + - [577, 9872.87] - - [1024, 4096, 1, 3262] - - [567, 9924.73] + - [571, 9924.73] - - [1024, 4096, 1, 3840] - - [566, 9961.74] + - [570, 9961.74] - - [1024, 33708, 1, 4026] - - [566, 10334.2] + - [570, 10334.2] - - [4096, 1024, 1, 3168] - - [567, 9878.35] + - [571, 9878.35] - - [64, 101, 624, 101] - - [547, 5734.62] + - [551, 5734.62] - - [1024, 4096, 1, 3999] - - [566, 9947.0] + - [570, 9947.0] - - [1024, 4096, 1, 3549] - - [566, 9923.2] + - [570, 9923.2] - - [4096, 1024, 1, 3375] - - [573, 9868.79] + - [577, 9868.79] - - [1024, 4096, 1, 3496] - - [574, 9928.57] + - [578, 9928.57] - - [64, 29, 2176, 29] - - [543, 3289.92] + - [547, 3289.92] - - [1024, 4096, 1, 3190] - - [574, 9897.51] + - [578, 9897.51] - - [4096, 1024, 1, 3273] - - [574, 9853.55] + - [578, 9853.55] - - [1024, 4096, 1, 3406] - - [573, 9906.94] + - [577, 9906.94] - - [4096, 1024, 1, 4005] - - [566, 9907.87] + - [570, 9907.87] - - [4096, 1024, 1, 3555] - - [573, 9878.86] + - [577, 9878.86] - - [4096, 1024, 1, 2505] - - [573, 9785.0] + - [577, 9785.0] - - [1024, 4096, 1, 3460] - - [573, 9930.14] + - [577, 9930.14] - - [64, 17, 3632, 17] - - [544, 1917.17] + - [548, 1917.17] - - [1024, 4096, 1, 3579] - - [567, 9920.84] + - [571, 9920.84] - - [1024, 33708, 1, 4030] - - [567, 10327.6] + - [571, 10327.6] - - [1024, 4096, 1, 3510] - - [567, 9931.21] + - [571, 9931.21] - - [1024, 1024, 1, 3969] - - [564, 9020.73] + - [568, 9020.73] - - [1024, 4096, 1, 3282] - - [574, 9919.95] + - [578, 9919.95] - - [1024, 4096, 1, 3377] - - [566, 9927.24] + - [570, 9927.24] - - [1024, 4096, 1, 2935] - - [574, 9903.38] + - [578, 9903.38] - - [64, 41, 1552, 41] - - [544, 3740.38] + - [548, 3740.38] - - [1024, 4096, 1, 3498] - - [566, 9914.91] + - [570, 9914.91] - - [1024, 4096, 1, 3593] - - [573, 9925.54] + - [577, 9925.54] - - [1024, 1024, 1, 3948] - - [572, 9008.93] + - [576, 9008.93] - - [4096, 1024, 1, 3226] - - [574, 9854.65] + - [578, 9854.65] - - [1024, 4096, 1, 2499] - - [573, 9904.72] + - [577, 9904.72] - - [1024, 4096, 1, 3296] - - [566, 9926.79] + - [570, 9926.79] - - [1024, 4096, 1, 3455] - - [573, 9917.42] + - [577, 9917.42] - - [1024, 4096, 1, 3399] - - [567, 9919.6] + - [571, 9919.6] - - [1024, 4096, 1, 3205] - - [566, 9917.64] + - [570, 9917.64] - - [4096, 1024, 1, 4026] - - [574, 9897.71] + - [578, 9897.71] - - [1024, 4096, 1, 3484] - - [566, 9915.43] + - [570, 9915.43] - - [4096, 1024, 1, 3302] - - [574, 9862.7] + - [578, 9862.7] - - [1024, 4096, 1, 3485] - - [574, 9912.9] + - [578, 9912.9] - - [1024, 1024, 1, 3996] - - [572, 9008.67] + - [576, 9008.67] - - [1024, 4096, 1, 3126] - - [567, 9910.06] + - [571, 9910.06] - - [1024, 4096, 1, 4050] - - [566, 9951.11] + - [570, 9951.11] - - [4096, 1024, 1, 3235] - - [567, 9870.64] + - [571, 9870.64] - - [1024, 33708, 1, 3955] - - [566, 10336.0] + - [570, 10336.0] - - [1024, 4096, 1, 3342] - - [566, 9903.75] + - [570, 9903.75] - - [1024, 1024, 1, 3900] - - [571, 9082.82] + - [575, 9082.82] - - [1024, 4096, 1, 3397] - - [574, 9922.6] + - [578, 9922.6] - - [4096, 1024, 1, 3491] - - [574, 9880.65] + - [578, 9880.65] - - [1024, 4096, 1, 3503] - - [566, 9923.18] + - [570, 9923.18] - - [1024, 4096, 1, 3140] - - [567, 9908.31] + - [571, 9908.31] - - [4096, 1024, 1, 3121] - - [573, 9860.22] + - [577, 9860.22] - - [4096, 1024, 1, 3276] - - [573, 9854.09] + - [577, 9854.09] - - [1024, 4096, 1, 3321] - - [574, 9917.76] + - [578, 9917.76] - - [1024, 4096, 1, 3870] - - [574, 9930.97] + - [578, 9930.97] - - [4096, 1024, 1, 3475] - - [573, 9877.48] + - [577, 9877.48] - - [1024, 4096, 1, 2984] - - [573, 9895.49] + - [577, 9895.49] - - [4096, 1024, 1, 3363] - - [567, 9873.34] + - [571, 9873.34] - - [1024, 4096, 1, 3582] - - [573, 9920.77] + - [577, 9920.77] - - [4096, 1024, 1, 3509] - - [573, 9886.76] + - [577, 9886.76] - - [1024, 4096, 1, 3426] - - [566, 9928.76] + - [570, 9928.76] - - [4096, 1024, 1, 3136] - - [573, 9872.51] + - [577, 9872.51] - - [1024, 4096, 1, 3232] - - [574, 9926.19] + - [578, 9926.19] - - [4096, 1024, 1, 3103] - - [573, 9838.93] + - [577, 9838.93] - - [1024, 4096, 1, 3335] - - [567, 9913.27] + - [571, 9913.27] - - [1024, 4096, 1, 3900] - - [566, 9937.91] + - [570, 9937.91] - - [4096, 1024, 1, 3512] - - [567, 9877.16] + - [571, 9877.16] - - [4096, 1024, 1, 3222] - - [573, 9859.67] + - [577, 9859.67] - - [1024, 4096, 1, 3165] - - [573, 9899.61] + - [577, 9899.61] - - [4096, 1024, 1, 3408] - - [573, 9899.58] + - [577, 9899.58] - - [4096, 1024, 1, 3751] - - [573, 9891.39] + - [577, 9891.39] - - [1024, 4096, 1, 3318] - - [566, 9913.32] + - [570, 9913.32] - - [4096, 1024, 1, 3442] - - [574, 9880.11] + - [578, 9880.11] - - [1024, 4096, 1, 3413] - - [573, 9921.8] + - [577, 9921.8] - - [4096, 1024, 1, 3524] - - [573, 9879.12] + - [577, 9879.12] - - [1024, 4096, 1, 3976] - - [574, 9945.47] + - [578, 9945.47] - - [1024, 4096, 1, 3475] - - [574, 9932.41] + - [578, 9932.41] - - [1024, 4096, 1, 3534] - - [566, 9911.39] + - [570, 9911.39] - - [4096, 1024, 1, 3301] - - [573, 9872.65] + - [577, 9872.65] - - [4096, 1024, 1, 3248] - - [573, 9878.12] + - [577, 9878.12] - - [1024, 4096, 1, 2977] - - [567, 9899.83] + - [571, 9899.83] - - [4096, 1024, 1, 3346] - - [573, 9875.97] + - [577, 9875.97] - - [1024, 4096, 1, 3451] - - [566, 9920.06] + - [570, 9920.06] - - [1024, 4096, 1, 3257] - - [567, 9904.92] + - [571, 9904.92] - - [1024, 1024, 1, 3640] - - [565, 8983.29] + - [569, 8983.29] - - [1024, 4096, 1, 3356] - - [566, 9904.38] + - [570, 9904.38] - - [4096, 1024, 1, 3348] - - [574, 9872.43] + - [578, 9872.43] - - [4096, 1024, 1, 3335] - - [573, 9865.72] + - [577, 9865.72] - - [4096, 1024, 1, 3505] - - [573, 9888.78] + - [577, 9888.78] - - [1024, 4096, 1, 3490] - - [566, 9937.9] + - [570, 9937.9] - - [4096, 1024, 1, 3447] - - [573, 9865.29] + - [577, 9865.29] - - [1024, 4096, 1, 3267] - - [574, 9919.22] + - [578, 9919.22] - - [4096, 1024, 1, 3230] - - [573, 9853.1] + - [577, 9853.1] - - [4096, 1024, 1, 3455] - - [573, 9862.34] + - [577, 9862.34] - - [1024, 4096, 1, 3925] - - [566, 9945.54] + - [570, 9945.54] - - [1024, 4096, 1, 3362] - - [567, 9921.53] + - [571, 9921.53] - - [4096, 1024, 1, 3969] - - [574, 9911.88] + - [578, 9911.88] - - [4096, 1024, 1, 3527] - - [573, 9882.77] + - [577, 9882.77] - - [1024, 4096, 1, 3585] - - [567, 9946.42] + - [571, 9946.42] - - [4096, 1024, 1, 3063] - - [573, 9853.93] + - [577, 9853.93] - - [4096, 1024, 1, 3435] - - [573, 9867.03] + - [577, 9867.03] - - [4096, 1024, 1, 3366] - - [574, 9863.92] + - [578, 9863.92] - - [4096, 1024, 1, 3581] - - [566, 9868.47] + - [570, 9868.47] - - [1024, 33708, 1, 3906] - - [566, 10339.2] + - [570, 10339.2] - - [1024, 4096, 1, 3464] - - [574, 9916.11] + - [578, 9916.11] - - [1024, 4096, 1, 3440] - - [573, 9945.15] + - [577, 9945.15] - - [4096, 1024, 1, 3143] - - [573, 9846.66] + - [577, 9846.66] - - [1024, 4096, 1, 3349] - - [567, 9912.73] + - [571, 9912.73] - - [4096, 1024, 1, 3416] - - [573, 9885.03] + - [577, 9885.03] - - [4096, 1024, 1, 3365] - - [573, 9875.9] + - [577, 9875.9] - - [1024, 4096, 1, 3470] - - [574, 9914.88] + - [578, 9914.88] - - [4096, 1024, 1, 3287] - - [573, 9860.59] + - [577, 9860.59] - - [1024, 4096, 1, 3441] - - [574, 9928.88] + - [578, 9928.88] - - [4096, 1024, 1, 3224] - - [573, 9857.73] + - [577, 9857.73] - - [1024, 4096, 1, 3387] - - [566, 9911.62] + - [570, 9911.62] - - [1024, 4096, 1, 3547] - - [566, 9920.26] + - [570, 9920.26] - - [4096, 1024, 1, 3478] - - [567, 9882.8] + - [571, 9882.8] - - [4096, 1024, 1, 3548] - - [574, 9869.35] + - [578, 9869.35] - - [1024, 33708, 1, 4020] - - [566, 10345.2] + - [570, 10345.2] - - [4096, 1024, 1, 3320] - - [573, 9863.64] + - [577, 9863.64] - - [1024, 4096, 1, 3906] - - [573, 9942.57] + - [577, 9942.57] - - [4096, 1024, 1, 3796] - - [573, 9899.03] + - [577, 9899.03] - - [1024, 4096, 1, 3306] - - [566, 9902.3] + - [570, 9902.3] - - [1024, 4096, 1, 3401] - - [574, 9913.85] + - [578, 9913.85] - - [64, 147, 432, 147] - - [557, 6626.5] + - [561, 6626.5] - - [1024, 4096, 1, 3215] - - [574, 9911.14] + - [578, 9911.14] - - [4096, 1024, 1, 4012] - - [574, 9898.1] + - [578, 9898.1] - - [1024, 4096, 1, 2765] - - [574, 9863.63] + - [578, 9863.63] - - [4096, 1024, 1, 3554] - - [567, 9883.42] + - [571, 9883.42] - - [4096, 1024, 1, 3423] - - [573, 9866.62] + - [577, 9866.62] - - [1024, 1024, 1, 3751] - - [571, 9006.26] + - [575, 9006.26] - - [1024, 4096, 1, 3562] - - [567, 9921.98] + - [571, 9921.98] - - [1024, 4096, 1, 3489] - - [566, 9936.68] + - [570, 9936.68] - - [4096, 1024, 1, 3358] - - [573, 9858.12] + - [577, 9858.12] - - [4096, 1024, 1, 3270] - - [574, 9850.74] + - [578, 9850.74] - - [1024, 4096, 1, 3293] - - [566, 9905.23] + - [570, 9905.23] - - [1024, 4096, 1, 3376] - - [566, 9934.88] + - [570, 9934.88] - - [4096, 1024, 1, 3245] - - [573, 9852.42] + - [577, 9852.42] - - [4096, 1024, 1, 3541] - - [573, 9887.12] + - [577, 9887.12] - - [4096, 1024, 1, 3443] - - [573, 9871.63] + - [577, 9871.63] - - [4096, 1024, 1, 3438] - - [574, 9863.76] + - [578, 9863.76] - - [4096, 1024, 1, 3244] - - [573, 9859.66] + - [577, 9859.66] - - [1024, 4096, 1, 3365] - - [573, 9922.0] + - [577, 9922.0] - - [1024, 4096, 1, 3299] - - [567, 9923.28] + - [571, 9923.28] - - [4096, 1024, 1, 3840] - - [573, 9914.65] + - [577, 9914.65] - - [1024, 4096, 1, 3471] - - [574, 9918.28] + - [578, 9918.28] - - [1024, 4096, 1, 3398] - - [566, 9918.89] + - [570, 9918.89] - - [4096, 1024, 1, 3162] - - [573, 9843.83] + - [577, 9843.83] - - [1024, 4096, 1, 4005] - - [567, 9947.77] + - [571, 9947.77] - - [4096, 1024, 1, 3579] - - [573, 9868.15] + - [577, 9868.15] - - [64, 18, 3440, 18] - - [549, 2059.23] + - [553, 2059.23] - - [64, 177, 352, 177] - - [568, 7315.3] + - [572, 7315.3] - - [1024, 4096, 1, 3121] - - [574, 9930.24] + - [578, 9930.24] - - [4096, 1024, 1, 3441] - - [573, 9883.18] + - [577, 9883.18] - - [4096, 1024, 1, 3422] - - [573, 9858.31] + - [577, 9858.31] - - [4096, 1024, 1, 3444] - - [573, 9886.93] + - [577, 9886.93] - - [1024, 4096, 1, 3337] - - [567, 9911.35] + - [571, 9911.35] - - [4096, 1024, 1, 3550] - - [566, 9871.77] + - [570, 9871.77] - - [1024, 4096, 1, 3477] - - [566, 9930.55] + - [570, 9930.55] - - [4096, 1024, 1, 3490] - - [573, 9878.35] + - [577, 9878.35] - - [4096, 1024, 1, 3585] - - [573, 9893.53] + - [577, 9893.53] - - [1024, 4096, 1, 3143] - - [566, 9901.09] + - [570, 9901.09] - - [1024, 33708, 1, 3876] - - [567, 10330.7] + - [571, 10330.7] - - [1024, 4096, 1, 3320] - - [574, 9913.08] + - [578, 9913.08] - - [1024, 4096, 1, 3423] - - [574, 9914.04] + - [578, 9914.04] - - [1024, 4096, 1, 3894] - - [566, 9944.37] + - [570, 9944.37] - - [4096, 1024, 1, 3410] - - [573, 9878.57] + - [577, 9878.57] - - [1024, 4096, 1, 3561] - - [566, 9926.58] + - [570, 9926.58] - - [4096, 1024, 1, 3492] - - [567, 9872.82] + - [571, 9872.82] - - [64, 85, 752, 85] - - [544, 5734.25] + - [548, 5734.25] - - [36548, 1024, 1, 3712] - - [576, 10367.5] + - [580, 10367.5] - - [4096, 2048, 1, 128] - - [577, 8743.83] + - [581, 8743.83] - - [1024, 1024, 1, 3712] - - [578, 9976.19] + - [582, 9976.19] - - [1024, 1024, 1, 128] - - [575, 5765.37] + - [579, 5765.37] - - [4096, 3072, 1, 128] - - [577, 8869.01] + - [581, 8869.01] - - [768, 3072, 1, 4096] - - [589, 10028.7] + - [593, 10028.7] - - [64, 256, 192, 256] - - [583, 8791.55] + - [587, 8791.55] - - [768, 2, 1, 16] - - [586, 4.95484] + - [590, 4.95484] - - [768, 768, 1, 64] - - [582, 3469.55] + - [586, 3469.55] - - [768, 768, 1, 4096] - - [590, 7475.0] + - [594, 7475.0] - - [768, 30522, 1, 1280] - - [593, 10296.9] + - [597, 10296.9] - - [64, 128, 384, 128] - - [583, 7660.83] + - [587, 7660.83] - - [768, 30522, 1, 320] - - [591, 10007.9] + - [595, 10007.9] - - [768, 768, 1, 32] - - [580, 2359.3] + - [584, 2359.3] - - [3072, 768, 1, 4096] - - [589, 10033.7] + - [593, 10033.7] - - [768, 30522, 1, 640] - - [592, 10206.7] + - [596, 10206.7] - - [64, 64, 768, 64] - - [581, 5494.72] + - [585, 5494.72] - - [768, 768, 1, 640] - - [590, 6721.64] + - [594, 6721.64] - - [768, 768, 1, 16] - - [579, 1203.72] + - [583, 1203.72] - - [768, 768, 1, 1280] - - [588, 7138.57] + - [592, 7138.57] - - [768, 2, 1, 32] - - [584, 11.8154] + - [588, 11.8154] - - [2048, 2048, 1, 512] - - [604, 9607.57] + - [608, 9607.57] - - [512, 32, 1, 200] - - [597, 422.268] + - [601, 422.268] - - [1024, 1, 1, 200] - - [600, 24.6154] + - [604, 24.6154] - - [1600, 1024, 1, 512] - - [595, 8115.91] + - [599, 8115.91] - - [560, 1024, 1, 200] - - [594, 4810.74] + - [598, 4810.74] - - [1024, 1024, 1, 512] - - [603, 8614.74] + - [607, 8614.74] - - [2048, 1, 1, 512] - - [598, 80.9086] + - [602, 80.9086] - - [512, 512, 1, 200] - - [596, 4398.39] + - [600, 4398.39] - - [100, 2048, 1, 512] - - [601, 4443.12] + - [605, 4443.12] - - [1024, 1024, 1, 200] - - [602, 6990.51] + - [606, 6990.51] - - [1024, 64, 1, 512] - - [599, 2853.27] + - [603, 2853.27] - - [1024, 256, 1, 18944] - - [623, 9196.41] + - [627, 9196.41] - - [256, 3328, 1, 8976] - - [613, 8299.26] + - [617, 8299.26] - - [1024, 256, 1, 4352] - - [621, 8813.74] + - [625, 8813.74] - - [256, 9728, 1, 8976] - - [616, 9638.48] + - [620, 9638.48] - - [1024, 256, 1, 3072] - - [623, 8640.63] + - [627, 8640.63] - - [768, 2048, 1, 256] - - [615, 8662.93] + - [619, 8662.93] - - [1024, 256, 1, 19968] - - [620, 9220.86] + - [624, 9220.86] - - [256, 12800, 1, 8976] - - [610, 9418.42] + - [614, 9418.42] - - [1024, 256, 1, 3328] - - [624, 8682.48] + - [628, 8682.48] - - [256, 10240, 1, 8976] - - [617, 10137.7] + - [621, 10137.7] - - [1024, 256, 1, 15104] - - [622, 9167.03] + - [626, 9167.03] - - [256, 10496, 1, 8976] - - [610, 9858.38] + - [614, 9858.38] - - [1024, 256, 1, 2816] - - [625, 8575.71] + - [629, 8575.71] - - [1024, 256, 1, 4608] - - [620, 8861.21] + - [624, 8861.21] - - [256, 11264, 1, 8976] - - [607, 9627.69] + - [611, 9627.69] - - [1024, 256, 1, 6400] - - [620, 8985.23] + - [624, 8985.23] - - [1024, 256, 1, 16128] - - [620, 9170.26] + - [624, 9170.26] - - [256, 44505, 1, 8976] - - [614, 10331.8] + - [618, 10331.8] - - [256, 6144, 1, 8976] - - [617, 10395.0] + - [621, 10395.0] - - [1024, 256, 1, 5120] - - [622, 8881.53] + - [626, 8881.53] - - [1024, 256, 1, 7936] - - [625, 9023.14] + - [629, 9023.14] - - [256, 3840, 1, 8976] - - [612, 9541.28] + - [616, 9541.28] - - [1024, 256, 1, 21248] - - [620, 9209.72] + - [624, 9209.72] - - [1024, 256, 1, 12032] - - [622, 9156.17] + - [626, 9156.17] - - [256, 8192, 1, 8976] - - [619, 10374.4] + - [623, 10374.4] - - [1024, 256, 1, 3584] - - [621, 8712.2] + - [625, 8712.2] - - [1024, 256, 1, 14336] - - [622, 9162.51] + - [626, 9162.51] - - [256, 7168, 1, 8976] - - [608, 9554.86] + - [612, 9554.86] - - [1024, 256, 1, 13568] - - [620, 9165.04] + - [624, 9165.04] - - [256, 4096, 1, 8976] - - [612, 10146.6] + - [616, 10146.6] - - [1024, 256, 1, 4096] - - [621, 8783.88] + - [625, 8783.88] - - [256, 2560, 1, 8976] - - [611, 8381.56] + - [615, 8381.56] - - [256, 20992, 1, 8976] - - [610, 9989.86] + - [614, 9989.86] - - [256, 4352, 1, 8976] - - [611, 9634.92] + - [615, 9634.92] - - [256, 33536, 1, 8976] - - [610, 10218.1] + - [614, 10218.1] - - [256, 3584, 1, 8976] - - [612, 8924.5] + - [616, 8924.5] - - [256, 26112, 1, 8976] - - [611, 10272.3] + - [615, 10272.3] - - [256, 14336, 1, 8976] - - [615, 10217.3] + - [619, 10217.3] - - [1024, 256, 1, 14848] - - [622, 9185.19] + - [626, 9185.19] - - [1024, 256, 1, 8448] - - [623, 9025.89] + - [627, 9025.89] - - [1024, 256, 1, 28672] - - [620, 9256.4] + - [624, 9256.4] - - [1024, 256, 1, 5632] - - [620, 8932.69] + - [624, 8932.69] - - [256, 22016, 1, 8976] - - [615, 10151.9] + - [619, 10151.9] - - [1024, 256, 1, 33536] - - [620, 9243.07] + - [624, 9243.07] - - [256, 5120, 1, 8976] - - [606, 9418.05] + - [610, 9418.05] - - [256, 11520, 1, 8976] - - [613, 9701.0] + - [617, 9701.0] - - [256, 19968, 1, 8976] - - [611, 10228.0] + - [615, 10228.0] - - [1024, 256, 1, 5376] - - [622, 8892.52] + - [626, 8892.52] - - [1024, 256, 1, 22016] - - [620, 9244.24] + - [624, 9244.24] - - [256, 8960, 1, 8976] - - [611, 9841.31] + - [615, 9841.31] - - [1024, 256, 1, 15872] - - [620, 9223.15] + - [624, 9223.15] - - [256, 17408, 1, 8976] - - [615, 9785.77] + - [619, 9785.77] - - [256, 5632, 1, 8976] - - [615, 9564.22] + - [619, 9564.22] - - [256, 32512, 1, 8976] - - [614, 10357.9] + - [618, 10357.9] - - [256, 11008, 1, 8976] - - [607, 9445.13] + - [611, 9445.13] - - [1024, 256, 1, 6144] - - [622, 8955.81] + - [626, 8955.81] - - [256, 4864, 1, 8976] - - [607, 8979.35] + - [611, 8979.35] - - [256, 15104, 1, 8976] - - [610, 10007.0] + - [614, 10007.0] - - [1024, 256, 1, 9984] - - [620, 9110.43] + - [624, 9110.43] - - [256, 1280, 1, 8976] - - [606, 5944.34] + - [610, 5944.34] - - [1024, 256, 1, 1024] - - [622, 7005.1] + - [626, 7005.1] - - [1024, 256, 1, 9728] - - [622, 9066.19] + - [626, 9066.19] - - [1024, 256, 1, 10496] - - [620, 9118.05] + - [624, 9118.05] - - [256, 11776, 1, 8976] - - [617, 9911.64] + - [621, 9911.64] - - [256, 12544, 1, 8976] - - [610, 9235.25] + - [614, 9235.25] - - [1024, 256, 1, 17152] - - [620, 9152.21] + - [624, 9152.21] - - [1024, 256, 1, 11520] - - [622, 9146.77] + - [626, 9146.77] - - [1024, 256, 1, 21504] - - [622, 9207.42] + - [626, 9207.42] - - [256, 17152, 1, 8976] - - [609, 9654.71] + - [613, 9654.71] - - [1024, 256, 1, 17408] - - [620, 9181.17] + - [624, 9181.17] - - [256, 15872, 1, 8976] - - [618, 10086.4] + - [622, 10086.4] - - [256, 18688, 1, 8976] - - [611, 9612.47] + - [615, 9612.47] - - [256, 5888, 1, 8976] - - [615, 9988.33] + - [619, 9988.33] - - [512, 2048, 1, 256] - - [605, 7678.36] + - [609, 7678.36] - - [1024, 256, 1, 7680] - - [623, 9032.96] + - [627, 9032.96] - - [1024, 256, 1, 1280] - - [625, 7767.23] + - [629, 7767.23] - - [256, 14848, 1, 8976] - - [611, 9852.66] + - [615, 9852.66] - - [256, 9984, 1, 8976] - - [617, 9908.87] + - [621, 9908.87] - - [256, 20480, 1, 8976] - - [615, 10337.1] + - [619, 10337.1] - - [1024, 256, 1, 8192] - - [622, 9044.32] + - [626, 9044.32] - - [1024, 256, 1, 19712] - - [621, 9184.18] + - [625, 9184.18] - - [256, 13568, 1, 8976] - - [611, 9927.82] + - [615, 9927.82] - - [256, 13312, 1, 8976] - - [610, 9757.91] + - [614, 9757.91] - - [256, 2816, 1, 8976] - - [610, 9191.43] + - [614, 9191.43] - - [1024, 256, 1, 2304] - - [621, 8444.91] + - [625, 8444.91] - - [256, 21248, 1, 8976] - - [611, 10127.5] + - [615, 10127.5] - - [256, 16128, 1, 8976] - - [619, 10238.4] + - [623, 10238.4] - - [256, 512, 36, 98] - - [642, 7994.85] + - [646, 7994.85] - - [64, 192, 36, 25088] - - [711, 8613.89] + - [715, 8613.89] - - [128, 128, 64, 25] - - [641, 2540.15] + - [645, 2540.15] - - [256, 256, 64, 56] - - [642, 6924.56] + - [646, 6924.56] - - [512, 486, 36, 800] - - [649, 8994.84] + - [653, 8994.84] - - [512, 512, 36, 1568] - - [660, 9872.38] + - [664, 9872.38] - - [64, 192, 64, 3200] - - [705, 9295.89] + - [709, 9295.89] - - [256, 384, 36, 4096] - - [705, 9334.61] + - [709, 9334.61] - - [128, 256, 64, 32] - - [644, 4279.9] + - [648, 4279.9] - - [64, 128, 64, 23104] - - [711, 10103.1] + - [715, 10103.1] - - [128, 256, 64, 9] - - [635, 1709.63] + - [639, 1709.63] - - [256, 512, 36, 784] - - [645, 9520.73] + - [649, 9520.73] - - [256, 324, 36, 32] - - [683, 4473.38] + - [687, 4473.38] - - [512, 512, 36, 33] - - [654, 5925.17] + - [658, 5925.17] - - [16, 32, 36, 5760] - - [658, 1448.8] + - [662, 1448.8] - - [192, 384, 64, 128] - - [705, 8618.43] + - [709, 8618.43] - - [512, 512, 64, 72] - - [661, 8260.12] + - [665, 8260.12] - - [128, 128, 64, 1600] - - [634, 9008.38] + - [638, 9008.38] - - [512, 512, 36, 128] - - [705, 8871.62] + - [709, 8871.62] - - [192, 384, 64, 2304] - - [634, 9657.16] + - [638, 9657.16] - - [384, 256, 64, 450] - - [670, 9538.93] + - [674, 9538.93] - - [3, 64, 36, 6272] - - [658, 509.784] + - [662, 509.784] - - [3, 64, 64, 2888] - - [687, 708.621] + - [691, 708.621] - - [384, 256, 64, 2304] - - [670, 10287.5] + - [674, 10287.5] - - [512, 512, 64, 144] - - [705, 9226.7] + - [709, 9226.7] - - [256, 256, 36, 6272] - - [645, 9607.28] + - [649, 9607.28] - - [80, 192, 64, 4608] - - [706, 7347.93] + - [710, 7347.93] - - [64, 64, 36, 3136] - - [693, 5959.05] + - [697, 5959.05] - - [256, 384, 64, 2304] - - [670, 10283.4] + - [674, 10283.4] - - [512, 512, 36, 66] - - [654, 7618.08] + - [658, 7618.08] - - [128, 256, 64, 800] - - [680, 9611.15] + - [684, 9611.15] - - [64, 128, 36, 30] - - [636, 1242.61] + - [640, 1242.61] - - [192, 256, 36, 512] - - [705, 8657.97] + - [709, 8657.97] - - [256, 512, 64, 200] - - [705, 9153.87] + - [709, 9153.87] - - [256, 512, 64, 25] - - [683, 5349.88] + - [687, 5349.88] - - [3, 64, 64, 46208] - - [686, 808.562] + - [690, 808.562] - - [128, 256, 36, 1568] - - [678, 8528.62] + - [682, 8528.62] - - [64, 128, 64, 11552] - - [711, 9997.0] + - [715, 9997.0] - - [128, 192, 64, 946] - - [705, 9198.38] + - [709, 9198.38] - - [64, 192, 64, 12800] - - [666, 9000.66] + - [670, 9000.66] - - [224, 224, 64, 128] - - [643, 6312.07] + - [647, 6312.07] - - [128, 256, 64, 288] - - [705, 8697.87] + - [709, 8697.87] - - [64, 64, 64, 826] - - [648, 6650.21] + - [652, 6650.21] - - [256, 384, 64, 1152] - - [680, 10106.8] + - [684, 10106.8] - - [3, 64, 64, 92416] - - [686, 812.031] + - [690, 812.031] - - [32, 32, 36, 43808] - - [627, 2813.09] + - [631, 2813.09] - - [160, 320, 64, 288] - - [637, 8090.86] + - [641, 8090.86] - - [1, 16, 36, 23040] - - [674, 42.6667] + - [678, 42.6667] - - [128, 256, 36, 128] - - [652, 6049.48] + - [656, 6049.48] - - [128, 128, 64, 3360] - - [705, 9199.96] + - [709, 9199.96] - - [128, 128, 64, 420] - - [705, 8131.5] + - [709, 8131.5] - - [64, 128, 64, 361] - - [642, 6937.98] + - [646, 6937.98] - - [512, 512, 36, 16] - - [698, 3797.66] + - [702, 3797.66] - - [384, 256, 36, 800] - - [639, 9151.65] + - [643, 9151.65] - - [192, 384, 36, 4096] - - [639, 8867.57] + - [643, 8867.57] - - [64, 64, 64, 1600] - - [691, 7931.74] + - [695, 7931.74] - - [256, 384, 64, 576] - - [671, 9745.8] + - [675, 9745.8] - - [512, 512, 64, 14] - - [654, 3638.18] + - [658, 3638.18] - - [512, 512, 36, 8] - - [629, 2279.51] + - [633, 2279.51] - - [512, 486, 64, 128] - - [645, 8337.83] + - [649, 8337.83] - - [1, 16, 64, 640] - - [679, 49.9512] + - [683, 49.9512] - - [64, 96, 64, 288] - - [704, 5707.97] + - [708, 5707.97] - - [96, 96, 36, 1568] - - [673, 6866.75] + - [677, 6866.75] - - [256, 256, 36, 128] - - [677, 7703.82] + - [681, 7703.82] - - [64, 128, 36, 53824] - - [665, 6331.31] + - [669, 6331.31] - - [256, 256, 36, 32] - - [661, 4648.86] + - [665, 4648.86] - - [192, 256, 64, 288] - - [705, 8987.79] + - [709, 8987.79] - - [256, 256, 36, 16] - - [675, 2912.71] + - [679, 2912.71] - - [128, 256, 36, 3200] - - [678, 8680.27] + - [682, 8680.27] - - [160, 320, 64, 512] - - [637, 8449.44] + - [641, 8449.44] - - [128, 160, 36, 512] - - [648, 7214.97] + - [652, 7214.97] - - [96, 96, 36, 2592] - - [643, 7104.79] + - [647, 7104.79] - - [64, 96, 64, 800] - - [673, 7268.32] + - [677, 7268.32] - - [147, 64, 36, 18816] - - [689, 7116.26] + - [693, 7116.26] - - [160, 320, 36, 512] - - [643, 7874.82] + - [647, 7874.82] - - [256, 512, 36, 4] - - [682, 1034.78] + - [686, 1034.78] - - [96, 128, 64, 946] - - [665, 7901.07] + - [669, 7901.07] - - [256, 324, 64, 1568] - - [670, 8589.53] + - [674, 8589.53] - - [128, 128, 64, 50] - - [661, 4070.56] + - [665, 4070.56] - - [35, 96, 36, 8960] - - [655, 4207.3] + - [659, 4207.3] - - [32, 64, 36, 43808] - - [696, 4390.81] + - [700, 4390.81] - - [160, 224, 36, 128] - - [643, 5446.92] + - [647, 5446.92] - - [64, 64, 64, 81] - - [668, 2391.18] + - [672, 2391.18] - - [256, 256, 36, 3200] - - [634, 9559.55] + - [638, 9559.55] - - [256, 256, 36, 210] - - [645, 8414.61] + - [649, 8414.61] - - [192, 384, 64, 576] - - [705, 9468.75] + - [709, 9468.75] - - [512, 512, 64, 800] - - [680, 10096.4] + - [684, 10096.4] - - [512, 24, 36, 800] - - [631, 4761.77] + - [635, 4761.77] - - [64, 64, 64, 13216] - - [692, 8491.41] + - [696, 8491.41] - - [192, 224, 64, 1152] - - [648, 8769.06] + - [652, 8769.06] - - [256, 256, 64, 1152] - - [670, 9988.09] + - [674, 9988.09] - - [512, 486, 64, 512] - - [680, 9254.67] + - [684, 9254.67] - - [128, 128, 36, 784] - - [643, 7468.06] + - [647, 7468.06] - - [256, 512, 64, 1600] - - [667, 10232.5] + - [671, 10232.5] - - [512, 512, 64, 9] - - [661, 2599.78] + - [665, 2599.78] - - [96, 128, 64, 288] - - [673, 6599.43] + - [677, 6599.43] - - [64, 96, 36, 512] - - [673, 5073.75] + - [677, 5073.75] - - [256, 512, 36, 1568] - - [705, 9637.81] + - [709, 9637.81] - - [128, 128, 64, 400] - - [705, 8192.0] + - [709, 8192.0] - - [128, 128, 64, 800] - - [705, 8716.34] + - [709, 8716.34] - - [96, 128, 36, 512] - - [693, 6756.93] + - [697, 6756.93] - - [16, 32, 36, 360] - - [656, 754.036] + - [660, 754.036] - - [128, 256, 64, 3200] - - [670, 10222.5] + - [674, 10222.5] - - [96, 128, 64, 800] - - [673, 7967.9] + - [677, 7967.9] - - [256, 512, 64, 4] - - [635, 1097.99] + - [639, 1097.99] - - [256, 256, 64, 450] - - [680, 9347.45] + - [684, 9347.45] - - [64, 64, 64, 3200] - - [691, 8518.08] + - [695, 8518.08] - - [192, 224, 64, 128] - - [651, 7035.17] + - [655, 7035.17] - - [128, 128, 64, 288] - - [705, 7751.28] + - [709, 7751.28] - - [256, 256, 64, 72] - - [661, 7489.83] + - [665, 7489.83] - - [96, 208, 36, 512] - - [673, 6939.11] + - [677, 6939.11] - - [128, 256, 36, 3136] - - [648, 8669.33] + - [652, 8669.33] - - [64, 64, 36, 3520] - - [643, 6007.47] + - [647, 6007.47] - - [64, 128, 36, 1568] - - [706, 6897.7] + - [710, 6897.7] - - [160, 320, 64, 242] - - [632, 7873.17] + - [636, 7873.17] - - [192, 192, 36, 512] - - [643, 7707.32] + - [647, 7707.32] - - [512, 512, 36, 512] - - [705, 9582.42] + - [709, 9582.42] - - [1, 16, 64, 10240] - - [657, 71.3511] + - [661, 71.3511] - - [128, 128, 36, 512] - - [643, 7149.38] + - [647, 7149.38] - - [512, 512, 36, 256] - - [634, 9384.4] + - [638, 9384.4] - - [512, 512, 36, 1024] - - [628, 9777.89] + - [632, 9777.89] - - [96, 208, 64, 1152] - - [706, 7850.9] + - [710, 7850.9] - - [128, 192, 64, 3200] - - [634, 9490.82] + - [638, 9490.82] - - [256, 256, 36, 4096] - - [639, 9585.46] + - [643, 9585.46] - - [160, 160, 64, 288] - - [673, 7299.8] + - [677, 7299.8] - - [256, 256, 64, 896] - - [670, 9850.33] + - [674, 9850.33] - - [128, 256, 64, 242] - - [705, 8391.38] + - [709, 8391.38] - - [128, 128, 36, 440] - - [648, 6274.72] + - [652, 6274.72] - - [96, 128, 36, 1568] - - [693, 7875.03] + - [697, 7875.03] - - [192, 384, 36, 1024] - - [639, 8715.72] + - [643, 8715.72] - - [64, 96, 36, 10368] - - [710, 7478.59] + - [714, 7478.59] - - [128, 256, 64, 100] - - [654, 7084.97] + - [658, 7084.97] - - [112, 224, 36, 2048] - - [647, 7555.92] + - [651, 7555.92] - - [384, 256, 64, 1152] - - [670, 10102.3] + - [674, 10102.3] - - [192, 384, 36, 128] - - [705, 7543.04] + - [709, 7543.04] - - [128, 128, 36, 7040] - - [678, 7600.6] + - [682, 7600.6] - - [128, 256, 64, 1568] - - [670, 10005.9] + - [674, 10005.9] - - [128, 128, 36, 1568] - - [662, 7848.3] + - [666, 7848.3] - - [128, 256, 64, 72] - - [685, 6553.6] + - [689, 6553.6] - - [256, 256, 36, 12544] - - [699, 9365.04] + - [703, 9365.04] - - [256, 256, 36, 105] - - [661, 7286.06] + - [665, 7286.06] - - [128, 256, 36, 392] - - [648, 7625.69] + - [652, 7625.69] - - [64, 64, 64, 5408] - - [691, 8882.67] + - [695, 8882.67] - - [3, 64, 36, 25088] - - [658, 528.942] + - [662, 528.942] - - [384, 256, 36, 1024] - - [705, 9182.75] + - [709, 9182.75] - - [35, 96, 36, 13440] - - [712, 4110.29] + - [716, 4110.29] - - [128, 256, 64, 1152] - - [670, 9804.87] + - [674, 9804.87] - - [256, 324, 64, 32] - - [683, 5043.63] + - [687, 5043.63] - - [160, 224, 64, 128] - - [697, 6046.15] + - [701, 6046.15] - - [192, 224, 36, 2592] - - [695, 8878.68] + - [699, 8878.68] - - [96, 96, 64, 1152] - - [673, 8035.45] + - [677, 8035.45] - - [32, 64, 36, 90] - - [630, 964.465] + - [634, 964.465] - - [64, 128, 64, 2888] - - [645, 9047.23] + - [649, 9047.23] - - [256, 384, 36, 800] - - [705, 9154.02] + - [709, 9154.02] - - [512, 512, 64, 4] - - [702, 1233.62] + - [706, 1233.62] - - [192, 320, 36, 128] - - [642, 7388.19] + - [646, 7388.19] - - [64, 128, 36, 480] - - [706, 5653.27] + - [710, 5653.27] - - [192, 384, 64, 242] - - [705, 9079.99] + - [709, 9079.99] - - [256, 486, 64, 32] - - [698, 5909.18] + - [702, 5909.18] - - [147, 64, 64, 9702] - - [707, 7319.69] + - [711, 7319.69] - - [512, 512, 64, 64] - - [641, 8179.02] + - [645, 8179.02] - - [64, 192, 64, 3698] - - [634, 9287.89] + - [638, 9287.89] - - [73, 192, 64, 10439] - - [665, 6668.02] + - [669, 6668.02] - - [1, 16, 36, 1440] - - [681, 33.4452] + - [685, 33.4452] - - [128, 256, 36, 512] - - [648, 7989.15] + - [652, 7989.15] - - [512, 512, 64, 576] - - [680, 9951.89] + - [684, 9951.89] - - [64, 64, 36, 12544] - - [696, 5872.77] + - [700, 5872.77] - - [128, 128, 36, 880] - - [693, 7597.26] + - [697, 7597.26] - - [192, 224, 36, 128] - - [651, 6451.2] + - [655, 6451.2] - - [64, 64, 64, 800] - - [691, 6916.73] + - [695, 6916.73] - - [64, 128, 36, 12544] - - [669, 6395.88] + - [673, 6395.88] - - [64, 64, 36, 1568] - - [643, 5536.66] + - [647, 5536.66] - - [160, 160, 36, 512] - - [643, 7345.26] + - [647, 7345.26] - - [512, 24, 64, 512] - - [633, 5242.88] + - [637, 5242.88] - - [3, 64, 36, 3136] - - [658, 475.352] + - [662, 475.352] - - [256, 256, 64, 9] - - [683, 2106.51] + - [687, 2106.51] - - [3, 64, 64, 11552] - - [686, 785.127] + - [690, 785.127] - - [128, 256, 36, 12544] - - [701, 8792.13] + - [705, 8792.13] - - [128, 128, 36, 3136] - - [662, 8098.46] + - [666, 8098.46] - - [256, 512, 36, 3136] - - [645, 9694.39] + - [649, 9694.39] - - [64, 64, 36, 196] - - [659, 2757.76] + - [663, 2757.76] - - [144, 288, 36, 512] - - [693, 7077.89] + - [697, 7077.89] - - [256, 24, 64, 32] - - [672, 1483.83] + - [676, 1483.83] - - [384, 384, 36, 800] - - [634, 9246.5] + - [638, 9246.5] - - [512, 512, 64, 1600] - - [680, 10277.3] + - [684, 10277.3] - - [112, 224, 36, 512] - - [648, 6744.78] + - [652, 6744.78] - - [128, 128, 36, 49] - - [654, 2716.29] + - [658, 2716.29] - - [512, 512, 36, 4] - - [682, 1156.52] + - [686, 1156.52] - - [35, 96, 64, 4235] - - [643, 4631.28] + - [647, 4631.28] - - [192, 384, 64, 450] - - [634, 9372.2] + - [638, 9372.2] - - [256, 256, 36, 1024] - - [705, 9346.64] + - [709, 9346.64] - - [112, 224, 64, 1152] - - [648, 7523.95] + - [652, 7523.95] - - [256, 512, 64, 400] - - [667, 9597.95] + - [671, 9597.95] - - [149, 32, 36, 19072] - - [712, 5811.8] + - [716, 5811.8] - - [128, 256, 36, 6272] - - [648, 8754.68] + - [652, 8754.68] - - [128, 192, 36, 1568] - - [673, 8195.1] + - [677, 8195.1] - - [256, 256, 36, 512] - - [705, 9074.22] + - [709, 9074.22] - - [256, 256, 64, 112] - - [705, 8305.55] + - [709, 8305.55] - - [512, 512, 64, 18] - - [698, 4324.02] + - [702, 4324.02] - - [256, 256, 64, 18] - - [661, 3547.81] + - [665, 3547.81] - - [256, 256, 64, 1568] - - [670, 10141.7] + - [674, 10141.7] - - [64, 96, 36, 1568] - - [691, 6805.66] + - [695, 6805.66] - - [384, 256, 36, 4096] - - [705, 9311.1] + - [709, 9311.1] - - [256, 512, 64, 800] - - [680, 9998.35] + - [684, 9998.35] - - [256, 384, 36, 2048] - - [705, 9285.34] + - [709, 9285.34] - - [3, 64, 36, 200704] - - [687, 547.375] + - [691, 547.375] - - [384, 384, 64, 2304] - - [628, 9901.68] + - [632, 9901.68] - - [160, 320, 64, 128] - - [664, 7113.81] + - [668, 7113.81] - - [512, 512, 36, 528] - - [634, 9567.65] + - [638, 9567.65] - - [160, 320, 36, 128] - - [665, 6411.13] + - [669, 6411.13] - - [96, 96, 64, 800] - - [673, 7690.01] + - [677, 7690.01] - - [256, 512, 36, 49] - - [661, 6721.25] + - [665, 6721.25] - - [384, 384, 64, 450] - - [634, 9523.53] + - [638, 9523.53] - - [3, 64, 64, 23104] - - [686, 801.621] + - [690, 801.621] - - [256, 256, 64, 3200] - - [670, 10300.4] + - [674, 10300.4] - - [128, 192, 36, 512] - - [648, 7499.75] + - [652, 7499.75] - - [192, 192, 64, 288] - - [705, 8774.24] + - [709, 8774.24] - - [96, 208, 64, 242] - - [665, 5901.99] + - [669, 5901.99] - - [256, 16, 36, 3200] - - [694, 3807.77] + - [698, 3807.77] - - [512, 512, 64, 8] - - [672, 2379.75] + - [676, 2379.75] - - [64, 128, 64, 5776] - - [645, 9332.74] + - [649, 9332.74] - - [512, 512, 64, 288] - - [634, 9521.99] + - [638, 9521.99] - - [256, 16, 36, 32] - - [690, 766.005] + - [694, 766.005] - - [128, 192, 64, 288] - - [705, 8527.58] + - [709, 8527.58] - - [32, 64, 64, 640] - - [673, 4660.34] + - [677, 4660.34] - - [64, 64, 36, 392] - - [673, 3686.4] + - [677, 3686.4] - - [384, 384, 36, 1024] - - [639, 9282.48] + - [643, 9282.48] - - [64, 64, 36, 11552] - - [703, 5904.78] + - [707, 5904.78] - - [96, 128, 36, 6272] - - [693, 8350.99] + - [697, 8350.99] - - [128, 256, 36, 16] - - [675, 2144.81] + - [679, 2144.81] - - [256, 256, 64, 288] - - [705, 9140.13] + - [709, 9140.13] - - [64, 64, 64, 1652] - - [691, 7766.53] + - [695, 7766.53] - - [256, 384, 36, 1024] - - [639, 9203.27] + - [643, 9203.27] - - [96, 128, 64, 3200] - - [708, 8866.2] + - [712, 8866.2] - - [256, 324, 36, 3200] - - [647, 8194.25] + - [651, 8194.25] - - [128, 192, 64, 800] - - [705, 9198.03] + - [709, 9198.03] - - [64, 128, 64, 10] - - [646, 851.117] + - [650, 851.117] - - [96, 208, 64, 288] - - [673, 6667.58] + - [677, 6667.58] - - [64, 96, 36, 2592] - - [655, 7216.88] + - [659, 7216.88] - - [64, 128, 64, 160] - - [684, 5190.97] + - [688, 5190.97] - - [192, 384, 64, 512] - - [634, 9446.04] + - [638, 9446.04] - - [64, 64, 36, 6272] - - [643, 6212.01] + - [647, 6212.01] - - [512, 24, 36, 288] - - [640, 3922.47] + - [644, 3922.47] - - [128, 128, 64, 1568] - - [634, 9037.86] + - [638, 9037.86] - - [112, 224, 64, 242] - - [704, 6399.26] + - [708, 6399.26] - - [128, 256, 64, 1600] - - [670, 10010.3] + - [674, 10010.3] - - [32, 32, 64, 20000] - - [638, 4378.41] + - [642, 4378.41] - - [160, 192, 64, 288] - - [665, 7803.63] + - [669, 7803.63] - - [512, 24, 64, 128] - - [626, 3733.8] + - [630, 3733.8] - - [512, 512, 36, 32] - - [661, 5935.34] + - [665, 5935.34] - - [3, 64, 36, 100352] - - [658, 542.783] + - [662, 542.783] - - [3, 64, 64, 1444] - - [687, 674.159] + - [691, 674.159] - - [512, 512, 36, 3136] - - [628, 9921.1] + - [632, 9921.1] - - [128, 256, 64, 6400] - - [688, 10349.3] + - [692, 10349.3] - - [256, 256, 36, 2048] - - [705, 9518.99] + - [709, 9518.99] - - [128, 160, 64, 288] - - [648, 7549.75] + - [652, 7549.75] - - [256, 256, 64, 6400] - - [670, 10392.6] + - [674, 10392.6] - - [32, 64, 64, 20000] - - [696, 6493.86] + - [700, 6493.86] - - [256, 256, 36, 1680] - - [645, 9513.29] + - [649, 9513.29] - - [128, 128, 64, 210] - - [705, 7094.1] + - [709, 7094.1] - - [192, 384, 36, 2048] - - [634, 8818.65] + - [638, 8818.65] - - [256, 256, 64, 144] - - [705, 8608.61] + - [709, 8608.61] - - [384, 384, 36, 4096] - - [639, 9356.94] + - [643, 9356.94] - - [160, 320, 64, 1152] - - [665, 8749.48] + - [669, 8749.48] - - [384, 256, 36, 2048] - - [705, 9279.63] + - [709, 9279.63] - - [256, 512, 36, 392] - - [705, 9252.14] + - [709, 9252.14] - - [256, 512, 64, 50] - - [661, 7511.29] + - [665, 7511.29] - - [73, 192, 36, 23360] - - [709, 5802.93] + - [713, 5802.93] - - [3, 64, 36, 50176] - - [658, 542.037] + - [662, 542.037] - - [384, 384, 36, 2048] - - [634, 9325.8] + - [638, 9325.8] - - [256, 384, 64, 450] - - [680, 9528.66] + - [684, 9528.66] - - [192, 320, 64, 128] - - [639, 8399.81] + - [643, 8399.81] - - [128, 256, 36, 32] - - [654, 3276.8] + - [658, 3276.8] - - [160, 192, 36, 512] - - [693, 7752.34] + - [697, 7752.34] - - [512, 512, 64, 256] - - [645, 9473.64] + - [649, 9473.64] - - [256, 512, 64, 32] - - [683, 6391.32] + - [687, 6391.32] - - [384, 384, 64, 576] - - [634, 9614.79] + - [638, 9614.79] - - [64, 64, 64, 648] - - [691, 6282.15] + - [695, 6282.15] - - [512, 486, 36, 288] - - [705, 8624.93] + - [709, 8624.93] - - [32, 64, 36, 1440] - - [643, 3961.5] + - [647, 3961.5] - - [144, 288, 64, 242] - - [665, 6347.02] + - [669, 6347.02] - - [384, 256, 64, 576] - - [670, 9775.24] + - [674, 9775.24] - - [512, 512, 36, 64] - - [641, 7791.28] + - [645, 7791.28] - - [448, 384, 64, 128] - - [634, 9132.23] + - [638, 9132.23] - - [64, 128, 64, 722] - - [684, 8047.11] + - [688, 8047.11] - - [144, 288, 64, 288] - - [693, 6859.4] + - [697, 6859.4] - - [512, 512, 64, 224] - - [705, 9427.29] + - [709, 9427.29] - - [112, 224, 64, 288] - - [704, 6736.92] + - [708, 6736.92] - - [384, 384, 64, 1152] - - [628, 9820.46] + - [632, 9820.46] - - [448, 384, 36, 128] - - [705, 8761.31] + - [709, 8761.31] - - [64, 64, 64, 100] - - [651, 2708.1] + - [655, 2708.1] - - [256, 486, 36, 128] - - [677, 7640.04] + - [681, 7640.04] - - [64, 96, 64, 4608] - - [706, 8351.49] + - [710, 8351.49] - - [16, 32, 64, 160] - - [630, 736.36] + - [634, 736.36] - - [64, 192, 36, 6272] - - [706, 8041.19] + - [710, 8041.19] - - [64, 64, 64, 200] - - [659, 3924.31] + - [663, 3924.31] - - [256, 256, 36, 800] - - [705, 9299.55] + - [709, 9299.55] - - [64, 128, 36, 6272] - - [703, 6816.36] + - [707, 6816.36] - - [32, 64, 64, 40] - - [650, 885.622] + - [654, 885.622] - - [256, 16, 64, 32] - - [700, 1205.26] + - [704, 1205.26] - - [192, 384, 36, 800] - - [639, 8673.88] + - [643, 8673.88] - - [128, 128, 36, 3200] - - [673, 8538.89] + - [677, 8538.89] - - [256, 256, 36, 256] - - [645, 8454.36] + - [649, 8454.36] - - [192, 384, 64, 1152] - - [634, 9589.01] + - [638, 9589.01] - - [128, 256, 64, 200] - - [644, 8141.12] + - [648, 8141.12] - - [64, 96, 64, 1152] - - [673, 7620.88] + - [677, 7620.88] - - [128, 128, 36, 392] - - [648, 6175.51] + - [652, 6175.51] - - [80, 192, 36, 10368] - - [696, 6497.16] + - [700, 6497.16] - - [224, 224, 36, 128] - - [706, 5826.89] + - [710, 5826.89] - - [512, 512, 64, 28] - - [661, 5728.81] + - [665, 5728.81] - - [256, 16, 64, 1568] - - [676, 4637.2] + - [680, 4637.2] - - [144, 288, 64, 1152] - - [693, 7784.24] + - [697, 7784.24] - - [256, 256, 64, 576] - - [670, 9596.12] + - [674, 9596.12] - - [64, 128, 36, 784] - - [706, 6058.99] + - [710, 6058.99] - - [256, 24, 36, 128] - - [640, 2239.84] + - [644, 2239.84] - - [256, 256, 64, 2304] - - [670, 10225.7] + - [674, 10225.7] - - [192, 384, 36, 512] - - [705, 8549.03] + - [709, 8549.03] - - [16, 32, 64, 2560] - - [658, 2153.13] + - [662, 2153.13] - - [256, 512, 36, 32] - - [683, 5702.23] + - [687, 5702.23] - - [512, 512, 64, 128] - - [705, 9084.11] + - [709, 9084.11] - - [128, 128, 64, 200] - - [642, 6971.91] + - [646, 6971.91] - - [512, 512, 64, 32] - - [654, 6248.5] + - [658, 6248.5] - - [128, 256, 36, 196] - - [654, 6628.76] + - [658, 6628.76] - - [8, 384, 64, 6600] - - [686, 2733.89] + - [690, 2733.89] - - [149, 32, 64, 8195] - - [648, 6050.91] + - [652, 6050.91] - - [35, 96, 64, 6160] - - [693, 4689.35] + - [697, 4689.35] - - [64, 64, 36, 1760] - - [643, 5622.24] + - [647, 5622.24] - - [196, 528, 32, 32] - - [726, 4088.41] + - [730, 4088.41] - - [5329, 64, 32, 80] - - [719, 8331.14] + - [723, 8331.14] - - [64, 2880, 1, 320] - - [770, 4362.6] + - [774, 4362.6] - - [49, 832, 32, 256] - - [733, 5618.63] + - [737, 5618.63] - - [196, 512, 32, 24] - - [720, 3621.73] + - [724, 3621.73] - - [289, 1120, 1, 160] - - [716, 3302.86] + - [720, 3302.86] - - [1225, 192, 32, 32] - - [724, 6194.57] + - [728, 6194.57] - - [64, 2048, 32, 384] - - [747, 9541.54] + - [751, 9541.54] - - [1001, 1536, 1, 32] - - [718, 3575.67] + - [722, 3575.67] - - [289, 1792, 1, 320] - - [741, 5140.33] + - [745, 5140.33] - - [1001, 1024, 1, 32] - - [713, 2733.4] + - [717, 2733.4] - - [196, 480, 32, 64] - - [774, 5070.42] + - [778, 5070.42] - - [64, 1728, 1, 320] - - [771, 3205.57] + - [775, 3205.57] - - [49, 832, 32, 160] - - [775, 4988.82] + - [779, 4988.82] - - [49, 832, 32, 384] - - [733, 5901.95] + - [737, 5901.95] - - [289, 896, 1, 192] - - [759, 3452.59] + - [763, 3452.59] - - [289, 1024, 32, 384] - - [778, 8902.42] + - [782, 8902.42] - - [784, 192, 32, 96] - - [789, 7853.63] + - [793, 7853.63] - - [50176, 256, 1, 128] - - [752, 9041.83] + - [756, 9041.83] - - [289, 1024, 32, 256] - - [787, 8660.72] + - [791, 8660.72] - - [289, 1024, 32, 192] - - [776, 8433.35] + - [780, 8433.35] - - [12544, 512, 1, 256] - - [736, 9187.34] + - [740, 9187.34] - - [1225, 1728, 1, 192] - - [740, 7720.85] + - [744, 7720.85] - - [196, 480, 32, 96] - - [785, 5662.5] + - [789, 5662.5] - - [196, 512, 32, 144] - - [779, 6531.38] + - [783, 6531.38] - - [784, 400, 1, 32] - - [714, 1280.0] + - [718, 1280.0] - - [289, 768, 32, 128] - - [780, 7913.61] + - [784, 7913.61] - - [5329, 576, 1, 96] - - [723, 7563.46] + - [727, 7563.46] - - [49, 1200, 1, 128] - - [767, 1011.61] + - [771, 1011.61] - - [64, 1536, 32, 256] - - [781, 9159.54] + - [785, 9159.54] - - [289, 2592, 1, 384] - - [749, 6002.71] + - [753, 6002.71] - - [196, 528, 32, 128] - - [784, 5987.1] + - [788, 5987.1] - - [64, 2048, 32, 448] - - [747, 9669.87] + - [751, 9669.87] - - [5329, 448, 1, 64] - - [719, 6201.02] + - [723, 6201.02] - - [784, 256, 32, 64] - - [721, 7623.18] + - [725, 7623.18] - - [784, 192, 32, 32] - - [726, 5874.26] + - [730, 5874.26] - - [21609, 288, 1, 32] - - [739, 5296.5] + - [743, 5296.5] - - [784, 256, 32, 32] - - [717, 6235.46] + - [721, 6235.46] - - [5041, 720, 1, 192] - - [735, 8140.98] + - [739, 8140.98] - - [289, 2016, 1, 256] - - [732, 5404.05] + - [736, 5404.05] - - [196, 512, 32, 128] - - [777, 6366.82] + - [781, 6366.82] - - [289, 768, 32, 160] - - [779, 8253.88] + - [783, 8253.88] - - [64, 1536, 32, 384] - - [750, 9508.5] + - [754, 9508.5] - - [64, 1280, 32, 320] - - [750, 9070.73] + - [754, 9070.73] - - [289, 896, 1, 128] - - [760, 2917.68] + - [764, 2917.68] - - [289, 3456, 1, 384] - - [740, 7274.91] + - [744, 7274.91] - - [196, 800, 1, 64] - - [762, 1393.78] + - [766, 1393.78] - - [64, 1280, 32, 384] - - [746, 9225.01] + - [750, 9225.01] - - [64, 1344, 1, 512] - - [765, 3041.45] + - [769, 3041.45] - - [1001, 4096, 1, 512] - - [746, 9391.77] + - [750, 9391.77] - - [1225, 192, 32, 64] - - [719, 7729.29] + - [723, 7729.29] - - [64, 1152, 1, 384] - - [769, 2440.65] + - [773, 2440.65] - - [729, 1600, 1, 192] - - [731, 6827.71] + - [735, 6827.71] - - [289, 1344, 1, 192] - - [729, 4439.04] + - [733, 4439.04] - - [784, 192, 32, 16] - - [756, 3663.04] + - [760, 3663.04] - - [3136, 1024, 1, 2048] - - [738, 9071.77] + - [742, 9071.77] - - [64, 1152, 1, 448] - - [766, 2564.45] + - [770, 2564.45] - - [49, 832, 32, 128] - - [729, 4733.16] + - [733, 4733.16] - - [784, 256, 32, 128] - - [742, 8471.6] + - [746, 8471.6] - - [49, 800, 1, 128] - - [764, 633.535] + - [768, 633.535] - - [196, 512, 32, 32] - - [726, 4354.26] + - [730, 4354.26] - - [1225, 384, 32, 96] - - [743, 8751.63] + - [747, 8751.63] - - [5041, 576, 1, 96] - - [725, 7067.63] + - [729, 7067.63] - - [49, 832, 32, 48] - - [758, 3316.72] + - [762, 3316.72] - - [5329, 160, 32, 64] - - [782, 8159.84] + - [786, 8159.84] - - [1225, 288, 32, 48] - - [772, 6673.65] + - [776, 6673.65] - - [4096, 9216, 1, 512] - - [754, 10116.9] + - [758, 10116.9] - - [196, 480, 32, 192] - - [783, 6388.46] + - [787, 6388.46] - - [64, 1152, 1, 256] - - [770, 1982.6] + - [774, 1982.6] - - [3136, 1024, 1, 512] - - [738, 8745.57] + - [742, 8745.57] - - [49, 832, 32, 32] - - [757, 2717.87] + - [761, 2717.87] - - [784, 192, 32, 64] - - [721, 7216.32] + - [725, 7216.32] - - [289, 1024, 32, 128] - - [744, 7970.5] + - [748, 7970.5] - - [289, 768, 32, 192] - - [788, 8327.27] + - [792, 8327.27] - - [289, 1120, 1, 192] - - [728, 3716.9] + - [732, 3716.9] - - [196, 512, 32, 112] - - [734, 6252.81] + - [738, 6252.81] - - [1001, 2048, 1, 32] - - [722, 4000.09] + - [726, 4000.09] - - [1225, 288, 32, 64] - - [782, 7208.04] + - [786, 7208.04] - - [196, 600, 1, 64] - - [761, 1093.95] + - [765, 1093.95] - - [1225, 384, 32, 192] - - [743, 9332.66] + - [747, 9332.66] - - [50176, 256, 1, 512] - - [753, 9833.54] + - [757, 9833.54] - - [196, 512, 32, 160] - - [780, 6614.34] + - [784, 6614.34] - - [4096, 4096, 1, 512] - - [751, 10032.2] + - [755, 10032.2] - - [49, 832, 32, 192] - - [729, 5244.53] + - [733, 5244.53] - - [1225, 256, 32, 64] - - [719, 7972.35] + - [723, 7972.35] - - [64, 2048, 32, 320] - - [747, 9404.27] + - [751, 9404.27] - - [196, 480, 32, 16] - - [773, 2724.49] + - [777, 2724.49] - - [1225, 256, 32, 48] - - [721, 7100.38] + - [725, 7100.38] - - [64, 1280, 32, 448] - - [746, 9344.41] + - [750, 9344.41] - - [1225, 1200, 1, 64] - - [715, 5157.89] + - [719, 5157.89] - - [1225, 384, 32, 64] - - [719, 8219.96] + - [723, 8219.96] - - [12544, 512, 1, 1024] - - [738, 9672.72] + - [742, 9672.72] - - [64, 1280, 32, 192] - - [734, 8525.01] + - [738, 8525.01] - - [196, 512, 32, 64] - - [719, 5489.34] + - [723, 5489.34] - - [289, 1792, 1, 256] - - [737, 4831.61] + - [741, 4831.61] - - [196, 528, 32, 256] - - [755, 6453.82] + - [759, 6453.82] - - [64, 2048, 32, 192] - - [742, 8955.81] + - [746, 8955.81] - - [196, 528, 32, 160] - - [783, 6161.15] + - [787, 6161.15] - - [1225, 192, 32, 48] - - [719, 7236.92] + - [723, 7236.92] - - [64, 1728, 1, 192] - - [769, 2480.57] + - [773, 2480.57] - - [1001, 2048, 1, 64] - - [795, 5714.42] + - [799, 5714.42] - - [5329, 64, 128, 80] - - [802, 8835.29] + - [806, 8835.29] - - [64, 1280, 128, 448] - - [800, 10020.5] + - [804, 10020.5] - - [289, 768, 128, 128] - - [803, 8542.71] + - [807, 8542.71] - - [1225, 192, 128, 64] - - [792, 8444.77] + - [796, 8444.77] - - [1225, 288, 128, 48] - - [805, 7244.66] + - [809, 7244.66] - - [289, 768, 128, 192] - - [807, 8794.49] + - [811, 8794.49] - - [289, 768, 128, 160] - - [804, 8705.33] + - [808, 8705.33] - - [64, 2048, 128, 192] - - [798, 9780.26] + - [802, 9780.26] - - [64, 1280, 128, 384] - - [801, 9950.9] + - [805, 9950.9] - - [1225, 256, 128, 48] - - [793, 8273.61] + - [797, 8273.61] - - [1225, 192, 128, 48] - - [793, 8140.32] + - [797, 8140.32] - - [1225, 288, 128, 64] - - [805, 7886.21] + - [809, 7886.21] - - [64, 1280, 128, 320] - - [797, 9894.56] + - [801, 9894.56] - - [1225, 256, 128, 64] - - [798, 8572.51] + - [802, 8572.51] - - [1001, 2048, 1, 128] - - [799, 7289.06] + - [803, 7289.06] - - [1225, 192, 128, 32] - - [794, 7104.57] + - [798, 7104.57] - - [64, 1280, 128, 192] - - [806, 9642.08] + - [810, 9642.08] - - [1001, 1536, 1, 64] - - [796, 5146.56] + - [800, 5146.56] - - [2048, 2048, 1, 1024] - - [810, 9940.21] + - [814, 9940.21] - - [3200, 2048, 1, 1024] - - [809, 9899.24] + - [813, 9899.24] - - [4096, 4096, 1, 1024] - - [811, 10222.2] + - [815, 10222.2] - - [2048, 256, 1, 1024] - - [808, 8452.0] + - [812, 8452.0] - - [257, 4096, 1, 1024] - - [809, 8353.5] + - [813, 8353.5] - - [64, 2048, 64, 192] - - [814, 9434.24] + - [818, 9434.24] - - [1225, 192, 64, 48] - - [817, 7799.38] + - [821, 7799.38] - - [1225, 288, 64, 48] - - [819, 7030.37] + - [823, 7030.37] - - [3136, 64, 64, 64] - - [812, 7941.3] + - [816, 7941.3] - - [1225, 192, 64, 32] - - [818, 6772.91] + - [822, 6772.91] - - [1225, 256, 64, 48] - - [817, 8022.81] + - [821, 8022.81] - - [64, 2048, 64, 384] - - [813, 9859.28] + - [817, 9859.28] - - [64, 1280, 64, 384] - - [813, 9675.44] + - [817, 9675.44] - - [64, 1280, 64, 192] - - [813, 9320.68] + - [817, 9320.68] - - [1225, 192, 64, 64] - - [817, 8180.87] + - [821, 8180.87] - - [3136, 256, 64, 64] - - [814, 8966.88] + - [818, 8966.88] - - [1225, 288, 64, 64] - - [819, 7567.39] + - [823, 7567.39] - - [5329, 64, 64, 80] - - [816, 8634.33] + - [820, 8634.33] - - [64, 1280, 64, 448] - - [813, 9702.62] + - [817, 9702.62] - - [1225, 256, 64, 64] - - [817, 8306.43] + - [821, 8306.43] - - [3136, 64, 64, 256] - - [815, 9431.89] + - [819, 9431.89] - - [64, 1280, 64, 320] - - [813, 9754.2] + - [817, 9754.2] - - [64, 2048, 64, 320] - - [813, 9765.55] + - [817, 9765.55] - - [64, 2048, 64, 448] - - [813, 9948.37] + - [817, 9948.37] + - - [65, 1024, 1, 6400] + - [824, 3556.98] + - - [256, 4096, 1, 6400] + - [825, 10132.4] + - - [1024, 4096, 1, 64] + - [826, 6918.44] + - - [1024, 4096, 1, 6336] + - [827, 10393.9] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml index 9af2a05c1..e740ea571 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -116061,6 +116061,543 @@ WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116189,7 +116726,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 + SolutionIndex: 725 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116338,7 +116875,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 + SolutionIndex: 726 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116487,7 +117024,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 + SolutionIndex: 727 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -116632,7 +117169,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 + SolutionIndex: 728 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116777,7 +117314,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 + SolutionIndex: 729 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -116922,7 +117459,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 + SolutionIndex: 730 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -117067,7 +117604,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 + SolutionIndex: 731 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -117216,7 +117753,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 + SolutionIndex: 732 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -117361,7 +117898,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 + SolutionIndex: 733 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -117506,7 +118043,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 + SolutionIndex: 734 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -117655,7 +118192,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 + SolutionIndex: 735 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 SubGroup0: 8 SubGroup1: 4 @@ -117800,7 +118337,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 + SolutionIndex: 736 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -117945,7 +118482,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 + SolutionIndex: 737 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -118090,7 +118627,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 + SolutionIndex: 738 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -118239,7 +118776,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 + SolutionIndex: 739 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -118388,7 +118925,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 + SolutionIndex: 740 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 SubGroup0: 12 SubGroup1: 16 @@ -118537,7 +119074,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 + SolutionIndex: 741 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -118686,7 +119223,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 + SolutionIndex: 742 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -118835,7 +119372,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 + SolutionIndex: 743 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -118984,7 +119521,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 + SolutionIndex: 744 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -119133,7 +119670,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 + SolutionIndex: 745 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -119282,7 +119819,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 + SolutionIndex: 746 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119431,7 +119968,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 + SolutionIndex: 747 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119580,7 +120117,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 + SolutionIndex: 748 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119729,7 +120266,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 + SolutionIndex: 749 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -119878,7 +120415,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 + SolutionIndex: 750 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120027,7 +120564,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 748 + SolutionIndex: 751 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120176,7 +120713,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 749 + SolutionIndex: 752 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120325,7 +120862,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 750 + SolutionIndex: 753 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120474,7 +121011,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 751 + SolutionIndex: 754 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120623,7 +121160,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 752 + SolutionIndex: 755 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120772,7 +121309,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 753 + SolutionIndex: 756 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -120921,7 +121458,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 754 + SolutionIndex: 757 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121070,7 +121607,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 755 + SolutionIndex: 758 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121219,7 +121756,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 756 + SolutionIndex: 759 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121368,7 +121905,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 757 + SolutionIndex: 760 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121517,7 +122054,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 758 + SolutionIndex: 761 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121666,7 +122203,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 759 + SolutionIndex: 762 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121815,7 +122352,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 760 + SolutionIndex: 763 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121964,7 +122501,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 761 + SolutionIndex: 764 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -122113,7 +122650,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 762 + SolutionIndex: 765 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122262,7 +122799,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 763 + SolutionIndex: 766 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -122411,7 +122948,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 764 + SolutionIndex: 767 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122560,7 +123097,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 765 + SolutionIndex: 768 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122709,7 +123246,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 766 + SolutionIndex: 769 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -122858,7 +123395,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 767 + SolutionIndex: 770 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -123007,7 +123544,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 768 + SolutionIndex: 771 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123156,7 +123693,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 769 + SolutionIndex: 772 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123305,7 +123842,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 770 + SolutionIndex: 773 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123454,7 +123991,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 771 + SolutionIndex: 774 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123603,7 +124140,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 772 + SolutionIndex: 775 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -123752,7 +124289,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 773 + SolutionIndex: 776 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -123901,7 +124438,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 774 + SolutionIndex: 777 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -124050,7 +124587,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 775 + SolutionIndex: 778 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -124199,7 +124736,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 776 + SolutionIndex: 779 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -124348,7 +124885,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 777 + SolutionIndex: 780 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -124497,7 +125034,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 778 + SolutionIndex: 781 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124646,7 +125183,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 779 + SolutionIndex: 782 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124795,7 +125332,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 780 + SolutionIndex: 783 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124944,7 +125481,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 781 + SolutionIndex: 784 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -125093,7 +125630,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 782 + SolutionIndex: 785 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -125242,7 +125779,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 783 + SolutionIndex: 786 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125391,7 +125928,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 784 + SolutionIndex: 787 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125540,7 +126077,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 785 + SolutionIndex: 788 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125689,7 +126226,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 786 + SolutionIndex: 789 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -125838,7 +126375,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 787 + SolutionIndex: 790 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -125987,7 +126524,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 788 + SolutionIndex: 791 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126136,7 +126673,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 789 + SolutionIndex: 792 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126285,7 +126822,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 790 + SolutionIndex: 793 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -126434,7 +126971,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 791 + SolutionIndex: 794 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -126583,7 +127120,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 792 + SolutionIndex: 795 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -126732,7 +127269,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 793 + SolutionIndex: 796 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126881,7 +127418,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 794 + SolutionIndex: 797 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -127030,7 +127567,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 795 + SolutionIndex: 798 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -127179,7 +127716,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 796 + SolutionIndex: 799 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -127328,7 +127865,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 797 + SolutionIndex: 800 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -127477,7 +128014,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 798 + SolutionIndex: 801 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127626,7 +128163,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 799 + SolutionIndex: 802 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127775,7 +128312,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 800 + SolutionIndex: 803 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127924,7 +128461,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 801 + SolutionIndex: 804 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -128073,7 +128610,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 802 + SolutionIndex: 805 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128222,7 +128759,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 803 + SolutionIndex: 806 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128371,7 +128908,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 804 + SolutionIndex: 807 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128520,7 +129057,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 805 + SolutionIndex: 808 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -128669,7 +129206,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 806 + SolutionIndex: 809 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -128818,7 +129355,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 807 + SolutionIndex: 810 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -128967,7 +129504,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 808 + SolutionIndex: 811 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129116,7 +129653,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 809 + SolutionIndex: 812 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129265,7 +129802,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 810 + SolutionIndex: 813 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129414,7 +129951,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 811 + SolutionIndex: 814 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -129563,7 +130100,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 812 + SolutionIndex: 815 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -129712,7 +130249,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 813 + SolutionIndex: 816 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -129861,7 +130398,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 814 + SolutionIndex: 817 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130010,7 +130547,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 815 + SolutionIndex: 818 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130159,7 +130696,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 816 + SolutionIndex: 819 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130308,7 +130845,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 817 + SolutionIndex: 820 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130457,7 +130994,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 818 + SolutionIndex: 821 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130606,7 +131143,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 819 + SolutionIndex: 822 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -130755,7 +131292,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 820 + SolutionIndex: 823 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130904,7 +131441,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 821 + SolutionIndex: 824 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -131049,7 +131586,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 822 + SolutionIndex: 825 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131194,7 +131731,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 823 + SolutionIndex: 826 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131343,7 +131880,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 824 + SolutionIndex: 827 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131492,7 +132029,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 825 + SolutionIndex: 828 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131637,7 +132174,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 826 + SolutionIndex: 829 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131782,7 +132319,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 827 + SolutionIndex: 830 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131931,7 +132468,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 828 + SolutionIndex: 831 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132076,7 +132613,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 829 + SolutionIndex: 832 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132225,7 +132762,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 830 + SolutionIndex: 833 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132374,7 +132911,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 831 + SolutionIndex: 834 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132519,7 +133056,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 832 + SolutionIndex: 835 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132668,7 +133205,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 833 + SolutionIndex: 836 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132813,7 +133350,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 834 + SolutionIndex: 837 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132958,7 +133495,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 835 + SolutionIndex: 838 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133103,7 +133640,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 836 + SolutionIndex: 839 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133248,7 +133785,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 837 + SolutionIndex: 840 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133397,7 +133934,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 838 + SolutionIndex: 841 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133546,7 +134083,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 839 + SolutionIndex: 842 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133695,7 +134232,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 840 + SolutionIndex: 843 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133840,7 +134377,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 841 + SolutionIndex: 844 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133989,7 +134526,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 842 + SolutionIndex: 845 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134134,7 +134671,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 843 + SolutionIndex: 846 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134283,7 +134820,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 844 + SolutionIndex: 847 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134432,7 +134969,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 845 + SolutionIndex: 848 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134577,7 +135114,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 846 + SolutionIndex: 849 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134726,7 +135263,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 847 + SolutionIndex: 850 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134875,7 +135412,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 848 + SolutionIndex: 851 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135020,7 +135557,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 849 + SolutionIndex: 852 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135169,7 +135706,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 850 + SolutionIndex: 853 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135314,7 +135851,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 851 + SolutionIndex: 854 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135463,7 +136000,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 852 + SolutionIndex: 855 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135608,7 +136145,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 853 + SolutionIndex: 856 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135757,7 +136294,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 854 + SolutionIndex: 857 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135906,7 +136443,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 855 + SolutionIndex: 858 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136051,7 +136588,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 856 + SolutionIndex: 859 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136200,7 +136737,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 857 + SolutionIndex: 860 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136349,7 +136886,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 858 + SolutionIndex: 861 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136494,7 +137031,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 859 + SolutionIndex: 862 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136643,7 +137180,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 860 + SolutionIndex: 863 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136788,7 +137325,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 861 + SolutionIndex: 864 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136937,7 +137474,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 862 + SolutionIndex: 865 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137082,7 +137619,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 863 + SolutionIndex: 866 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137231,7 +137768,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 864 + SolutionIndex: 867 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137376,7 +137913,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 865 + SolutionIndex: 868 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137525,7 +138062,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 866 + SolutionIndex: 869 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137670,7 +138207,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 867 + SolutionIndex: 870 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137819,7 +138356,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 868 + SolutionIndex: 871 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137964,7 +138501,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 869 + SolutionIndex: 872 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138113,7 +138650,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 870 + SolutionIndex: 873 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138258,7 +138795,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 871 + SolutionIndex: 874 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138407,7 +138944,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 872 + SolutionIndex: 875 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138552,7 +139089,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 873 + SolutionIndex: 876 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138701,7 +139238,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 874 + SolutionIndex: 877 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138846,7 +139383,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 875 + SolutionIndex: 878 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138995,7 +139532,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 876 + SolutionIndex: 879 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -139144,7 +139681,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 877 + SolutionIndex: 880 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139293,7 +139830,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 878 + SolutionIndex: 881 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139442,7 +139979,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 879 + SolutionIndex: 882 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -139591,7 +140128,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 880 + SolutionIndex: 883 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139740,7 +140277,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 881 + SolutionIndex: 884 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -139889,7 +140426,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 882 + SolutionIndex: 885 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -140038,7 +140575,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 883 + SolutionIndex: 886 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -140187,7 +140724,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 884 + SolutionIndex: 887 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -140336,7 +140873,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 885 + SolutionIndex: 888 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -140485,7 +141022,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 886 + SolutionIndex: 889 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -140634,7 +141171,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 887 + SolutionIndex: 890 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -140783,7 +141320,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 888 + SolutionIndex: 891 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -140932,7 +141469,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 889 + SolutionIndex: 892 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -141081,7 +141618,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 890 + SolutionIndex: 893 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141226,7 +141763,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 891 + SolutionIndex: 894 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 SubGroup0: 8 SubGroup1: 32 @@ -141371,7 +141908,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 892 + SolutionIndex: 895 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141516,7 +142053,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 893 + SolutionIndex: 896 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141661,7 +142198,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 894 + SolutionIndex: 897 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -141806,7 +142343,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 895 + SolutionIndex: 898 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -141951,7 +142488,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 896 + SolutionIndex: 899 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -142096,7 +142633,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 897 + SolutionIndex: 900 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -142241,7 +142778,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 898 + SolutionIndex: 901 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -142383,7 +142920,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 899 + SolutionIndex: 902 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 SubGroup0: 32 SubGroup1: 8 @@ -142529,7 +143066,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 900 + SolutionIndex: 903 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -142675,7 +143212,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 901 + SolutionIndex: 904 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -142821,7 +143358,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 902 + SolutionIndex: 905 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -142967,7 +143504,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 903 + SolutionIndex: 906 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 SubGroup1: 8 @@ -143109,7 +143646,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 904 + SolutionIndex: 907 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 SubGroup1: 32 @@ -143255,7 +143792,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 905 + SolutionIndex: 908 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143397,7 +143934,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 906 + SolutionIndex: 909 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143539,7 +144076,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 907 + SolutionIndex: 910 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143685,7 +144222,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 908 + SolutionIndex: 911 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143831,7 +144368,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 909 + SolutionIndex: 912 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -143977,7 +144514,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 910 + SolutionIndex: 913 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 SubGroup0: 32 SubGroup1: 8 @@ -144134,7 +144671,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 + SolutionIndex: 914 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -144296,7 +144833,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 + SolutionIndex: 915 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -144458,7 +144995,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 + SolutionIndex: 916 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -144620,7 +145157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 + SolutionIndex: 917 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -144782,7 +145319,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 + SolutionIndex: 918 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -144944,7 +145481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 + SolutionIndex: 919 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145106,7 +145643,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 + SolutionIndex: 920 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145268,7 +145805,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 + SolutionIndex: 921 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145430,7 +145967,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 + SolutionIndex: 922 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145592,7 +146129,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 + SolutionIndex: 923 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145754,7 +146291,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 + SolutionIndex: 924 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145916,7 +146453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 + SolutionIndex: 925 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -146078,7 +146615,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 + SolutionIndex: 926 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -146240,7 +146777,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 + SolutionIndex: 927 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146402,7 +146939,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 + SolutionIndex: 928 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146564,7 +147101,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 + SolutionIndex: 929 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146726,7 +147263,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 + SolutionIndex: 930 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146888,7 +147425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 + SolutionIndex: 931 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147050,7 +147587,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 + SolutionIndex: 932 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147212,7 +147749,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 + SolutionIndex: 933 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147374,7 +147911,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 + SolutionIndex: 934 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147536,7 +148073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 + SolutionIndex: 935 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147698,7 +148235,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 + SolutionIndex: 936 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147860,7 +148397,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 + SolutionIndex: 937 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148022,7 +148559,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 + SolutionIndex: 938 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148184,7 +148721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 + SolutionIndex: 939 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148348,7 +148885,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 + SolutionIndex: 940 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148512,7 +149049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 + SolutionIndex: 941 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148676,7 +149213,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 + SolutionIndex: 942 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148840,7 +149377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 + SolutionIndex: 943 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -149004,7 +149541,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 + SolutionIndex: 944 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -149168,7 +149705,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 + SolutionIndex: 945 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 StaggerU: 32 StaggerUMapping: 0 @@ -149332,7 +149869,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 + SolutionIndex: 946 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -149496,7 +150033,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 + SolutionIndex: 947 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 StaggerU: 32 StaggerUMapping: 0 @@ -149656,7 +150193,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 + SolutionIndex: 948 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 StaggerU: 32 StaggerUMapping: 0 @@ -149820,7 +150357,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 + SolutionIndex: 949 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -149980,7 +150517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 + SolutionIndex: 950 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -150144,7 +150681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 + SolutionIndex: 951 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -150308,7 +150845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 + SolutionIndex: 952 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -150472,7 +151009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 + SolutionIndex: 953 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -150636,7 +151173,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 + SolutionIndex: 954 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -150800,7 +151337,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 + SolutionIndex: 955 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 StaggerU: 32 StaggerUMapping: 0 @@ -150964,7 +151501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 + SolutionIndex: 956 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -151128,7 +151665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 + SolutionIndex: 957 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -151292,7 +151829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 + SolutionIndex: 958 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -151456,7 +151993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 + SolutionIndex: 959 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -151620,7 +152157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 + SolutionIndex: 960 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151784,7 +152321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 + SolutionIndex: 961 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151948,7 +152485,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 + SolutionIndex: 962 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152112,7 +152649,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 + SolutionIndex: 963 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152276,7 +152813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 + SolutionIndex: 964 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152443,7 +152980,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 + SolutionIndex: 965 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152606,7 +153143,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 + SolutionIndex: 966 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152773,7 +153310,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 + SolutionIndex: 967 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152936,7 +153473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 + SolutionIndex: 968 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153103,7 +153640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 + SolutionIndex: 969 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153266,7 +153803,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 + SolutionIndex: 970 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153433,7 +153970,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 + SolutionIndex: 971 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153596,7 +154133,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 + SolutionIndex: 972 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153763,7 +154300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 + SolutionIndex: 973 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153924,7 +154461,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 + SolutionIndex: 974 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154085,7 +154622,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 + SolutionIndex: 975 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154248,7 +154785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 + SolutionIndex: 976 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154415,7 +154952,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 + SolutionIndex: 977 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154580,7 +155117,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 + SolutionIndex: 978 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154743,7 +155280,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 + SolutionIndex: 979 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154910,7 +155447,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 + SolutionIndex: 980 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155073,7 +155610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 + SolutionIndex: 981 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155240,7 +155777,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 + SolutionIndex: 982 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155403,7 +155940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 + SolutionIndex: 983 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155566,7 +156103,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 + SolutionIndex: 984 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155731,7 +156268,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 + SolutionIndex: 985 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155894,7 +156431,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 + SolutionIndex: 986 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156057,7 +156594,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 + SolutionIndex: 987 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156222,7 +156759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 + SolutionIndex: 988 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156385,7 +156922,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 + SolutionIndex: 989 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156548,7 +157085,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 + SolutionIndex: 990 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156709,7 +157246,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 + SolutionIndex: 991 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156870,7 +157407,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 + SolutionIndex: 992 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157031,7 +157568,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 + SolutionIndex: 993 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157192,7 +157729,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 + SolutionIndex: 994 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157357,7 +157894,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 + SolutionIndex: 995 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157520,7 +158057,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 + SolutionIndex: 996 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157687,7 +158224,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 + SolutionIndex: 997 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157850,7 +158387,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 + SolutionIndex: 998 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158013,7 +158550,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 + SolutionIndex: 999 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158172,7 +158709,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 + SolutionIndex: 1000 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158335,7 +158872,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 + SolutionIndex: 1001 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158496,7 +159033,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 + SolutionIndex: 1002 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158661,7 +159198,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 + SolutionIndex: 1003 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158822,7 +159359,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 + SolutionIndex: 1004 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158983,7 +159520,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 + SolutionIndex: 1005 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159144,7 +159681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 + SolutionIndex: 1006 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159309,7 +159846,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 + SolutionIndex: 1007 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159470,7 +160007,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 + SolutionIndex: 1008 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159631,7 +160168,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 + SolutionIndex: 1009 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159792,7 +160329,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 + SolutionIndex: 1010 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159953,7 +160490,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 + SolutionIndex: 1011 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160114,7 +160651,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 + SolutionIndex: 1012 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160275,7 +160812,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 + SolutionIndex: 1013 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160436,7 +160973,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 + SolutionIndex: 1014 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160597,7 +161134,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 + SolutionIndex: 1015 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160758,7 +161295,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 + SolutionIndex: 1016 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160919,7 +161456,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 + SolutionIndex: 1017 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161080,7 +161617,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 + SolutionIndex: 1018 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161241,7 +161778,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 + SolutionIndex: 1019 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161402,7 +161939,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 + SolutionIndex: 1020 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161563,7 +162100,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 + SolutionIndex: 1021 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161724,7 +162261,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 + SolutionIndex: 1022 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161885,7 +162422,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 + SolutionIndex: 1023 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162044,7 +162581,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 + SolutionIndex: 1024 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162204,7 +162741,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 + SolutionIndex: 1025 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162364,7 +162901,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 + SolutionIndex: 1026 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162524,7 +163061,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 + SolutionIndex: 1027 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162684,7 +163221,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 + SolutionIndex: 1028 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162844,7 +163381,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 + SolutionIndex: 1029 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163004,7 +163541,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 + SolutionIndex: 1030 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163168,7 +163705,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 + SolutionIndex: 1031 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163328,7 +163865,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 + SolutionIndex: 1032 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163488,7 +164025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 + SolutionIndex: 1033 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163648,7 +164185,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 + SolutionIndex: 1034 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163808,7 +164345,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 + SolutionIndex: 1035 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163968,7 +164505,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 + SolutionIndex: 1036 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164128,7 +164665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 + SolutionIndex: 1037 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164292,7 +164829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 + SolutionIndex: 1038 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164452,7 +164989,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 + SolutionIndex: 1039 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164616,7 +165153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 + SolutionIndex: 1040 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164776,7 +165313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 + SolutionIndex: 1041 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164940,7 +165477,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 + SolutionIndex: 1042 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165100,7 +165637,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 + SolutionIndex: 1043 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165260,7 +165797,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 + SolutionIndex: 1044 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165420,7 +165957,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 + SolutionIndex: 1045 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165580,7 +166117,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 + SolutionIndex: 1046 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165740,7 +166277,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 + SolutionIndex: 1047 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165904,7 +166441,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 + SolutionIndex: 1048 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166068,7 +166605,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 + SolutionIndex: 1049 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166232,7 +166769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 + SolutionIndex: 1050 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166392,7 +166929,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 + SolutionIndex: 1051 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166556,7 +167093,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 + SolutionIndex: 1052 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166720,7 +167257,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 + SolutionIndex: 1053 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166880,7 +167417,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 + SolutionIndex: 1054 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167044,7 +167581,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 + SolutionIndex: 1055 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167208,7 +167745,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 + SolutionIndex: 1056 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167368,7 +167905,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 + SolutionIndex: 1057 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167532,7 +168069,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 + SolutionIndex: 1058 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167696,7 +168233,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 + SolutionIndex: 1059 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167860,7 +168397,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 + SolutionIndex: 1060 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168020,7 +168557,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 + SolutionIndex: 1061 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168184,7 +168721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 + SolutionIndex: 1062 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168344,7 +168881,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 + SolutionIndex: 1063 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168508,7 +169045,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 + SolutionIndex: 1064 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168672,7 +169209,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 + SolutionIndex: 1065 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168836,7 +169373,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 + SolutionIndex: 1066 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168996,7 +169533,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 + SolutionIndex: 1067 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169160,7 +169697,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 + SolutionIndex: 1068 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169324,7 +169861,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 + SolutionIndex: 1069 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169488,7 +170025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 + SolutionIndex: 1070 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169652,7 +170189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 + SolutionIndex: 1071 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169816,7 +170353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 + SolutionIndex: 1072 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169980,7 +170517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 + SolutionIndex: 1073 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170144,7 +170681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 + SolutionIndex: 1074 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170308,7 +170845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 + SolutionIndex: 1075 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170472,7 +171009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 + SolutionIndex: 1076 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170632,7 +171169,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 + SolutionIndex: 1077 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170796,7 +171333,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 + SolutionIndex: 1078 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170960,7 +171497,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 + SolutionIndex: 1079 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171124,7 +171661,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 + SolutionIndex: 1080 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171288,7 +171825,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 + SolutionIndex: 1081 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171452,7 +171989,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 + SolutionIndex: 1082 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171616,7 +172153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 + SolutionIndex: 1083 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171776,7 +172313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 + SolutionIndex: 1084 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171936,7 +172473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 + SolutionIndex: 1085 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172096,7 +172633,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 + SolutionIndex: 1086 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172256,7 +172793,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 + SolutionIndex: 1087 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172416,7 +172953,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 + SolutionIndex: 1088 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172576,7 +173113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 + SolutionIndex: 1089 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172736,7 +173273,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 + SolutionIndex: 1090 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172896,7 +173433,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 + SolutionIndex: 1091 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173056,7 +173593,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 + SolutionIndex: 1092 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173216,7 +173753,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 + SolutionIndex: 1093 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173376,7 +173913,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 + SolutionIndex: 1094 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173536,7 +174073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 + SolutionIndex: 1095 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173700,7 +174237,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 + SolutionIndex: 1096 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173864,7 +174401,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 + SolutionIndex: 1097 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174024,7 +174561,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 + SolutionIndex: 1098 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174188,7 +174725,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 + SolutionIndex: 1099 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174352,7 +174889,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 + SolutionIndex: 1100 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174516,7 +175053,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 + SolutionIndex: 1101 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174676,7 +175213,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 + SolutionIndex: 1102 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174840,7 +175377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 + SolutionIndex: 1103 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175000,7 +175537,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 + SolutionIndex: 1104 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175164,7 +175701,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 + SolutionIndex: 1105 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175328,7 +175865,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 + SolutionIndex: 1106 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175488,7 +176025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 + SolutionIndex: 1107 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175652,7 +176189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 + SolutionIndex: 1108 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175816,7 +176353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 + SolutionIndex: 1109 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175980,7 +176517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 + SolutionIndex: 1110 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176144,7 +176681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 + SolutionIndex: 1111 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176308,7 +176845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 + SolutionIndex: 1112 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176472,7 +177009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 + SolutionIndex: 1113 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176636,7 +177173,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 + SolutionIndex: 1114 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176800,7 +177337,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 + SolutionIndex: 1115 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176964,7 +177501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 + SolutionIndex: 1116 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177128,7 +177665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 + SolutionIndex: 1117 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177292,7 +177829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 + SolutionIndex: 1118 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177456,7 +177993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 + SolutionIndex: 1119 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177620,7 +178157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 + SolutionIndex: 1120 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177784,7 +178321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 + SolutionIndex: 1121 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177948,7 +178485,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 + SolutionIndex: 1122 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178112,7 +178649,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 + SolutionIndex: 1123 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178272,7 +178809,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 + SolutionIndex: 1124 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178436,7 +178973,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 + SolutionIndex: 1125 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178600,20 +179137,184 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -178621,10 +179322,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -178764,8 +179465,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178786,7 +179487,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -178808,7 +179509,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -178836,13 +179537,9 @@ LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -178883,8 +179580,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -178928,8 +179625,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178937,7 +179634,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -178950,7 +179647,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -178966,7 +179663,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178992,17 +179689,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179015,7 +179712,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179031,8 +179728,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179088,8 +179785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179112,7 +179809,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179132,37 +179829,205 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179188,11 +180053,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179203,8 +180068,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -179248,8 +180113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179257,7 +180122,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -179268,9 +180133,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -179286,7 +180151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179311,18 +180176,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -179339,10 +180204,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179351,12 +180216,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179412,8 +180277,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179422,10 +180287,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -179434,9 +180299,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179450,7 +180315,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179458,41 +180323,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -179503,7 +180368,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179516,7 +180381,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -179576,8 +180441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179596,11 +180461,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179614,49 +180479,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -179667,7 +180528,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179680,11 +180541,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179695,7 +180556,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -179740,8 +180601,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179749,7 +180610,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -179760,11 +180621,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179904,8 +180765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179926,7 +180787,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -179942,13 +180803,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -179968,34 +180829,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180003,13 +180868,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180019,7 +180884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -180064,16 +180929,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -180085,10 +180950,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -180102,13 +180967,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -180128,21 +180993,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -180155,7 +181016,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -180171,8 +181032,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -180183,7 +181044,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -180228,8 +181089,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180237,7 +181098,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -180250,9 +181111,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -180392,8 +181253,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180414,7 +181275,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -180455,34 +181316,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180491,13 +181352,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180508,7 +181369,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180552,14 +181413,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -180573,7 +181434,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -180615,22 +181476,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -180644,10 +181505,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180716,15 +181577,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -180737,7 +181598,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -180779,34 +181640,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180815,13 +181676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180876,14 +181737,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -180897,8 +181758,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -180920,7 +181781,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -180939,38 +181800,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180979,13 +181836,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180995,7 +181852,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181040,16 +181897,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181061,8 +181918,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -181078,13 +181935,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -181104,30 +181961,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -181139,13 +182000,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181155,8 +182016,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181200,8 +182061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181209,7 +182070,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181221,10 +182082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181238,13 +182099,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -181264,34 +182125,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 32 LVCA: 32 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181299,13 +182164,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181315,7 +182180,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181360,16 +182225,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181381,10 +182246,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181423,38 +182288,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -181463,11 +182328,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 1024 PackBatchDims: 0 @@ -181524,14 +182389,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -181545,7 +182410,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -181589,20 +182454,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -181617,9 +182482,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181631,9 +182496,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181688,15 +182553,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -181709,8 +182574,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -181751,39 +182616,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 64 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181791,13 +182656,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181852,15 +182717,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -181873,8 +182738,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -181890,7 +182755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181898,52 +182763,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -181955,13 +182820,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182016,8 +182881,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182036,11 +182901,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -182054,7 +182919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -182062,33 +182927,33 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 @@ -182096,22 +182961,22 @@ LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182119,13 +182984,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182180,31 +183045,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -182224,7 +183089,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -182252,13 +183117,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182299,8 +183160,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -182344,8 +183205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182353,7 +183214,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -182366,7 +183227,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182407,22 +183268,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182436,9 +183297,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -182447,11 +183308,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -182508,8 +183369,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182518,10 +183379,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -182530,7 +183391,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182552,7 +183413,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -182580,9 +183441,13 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182597,9 +183462,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182607,12 +183472,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -182623,8 +183488,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182668,8 +183533,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182677,12 +183542,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182690,7 +183555,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182731,22 +183596,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182760,9 +183625,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -182771,11 +183636,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -182832,8 +183697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182842,10 +183707,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -182870,7 +183735,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -182896,21 +183761,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182923,11 +183788,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182935,11 +183800,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -182996,8 +183861,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183006,11 +183871,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183018,9 +183883,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183034,7 +183899,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183043,7 +183908,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183059,18 +183924,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -183080,18 +183945,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183099,13 +183964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183160,20 +184025,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183181,10 +184046,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183225,14 +184090,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -183244,14 +184109,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -183263,13 +184128,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183324,28 +184189,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183387,22 +184252,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -183416,9 +184281,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -183427,11 +184292,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -183488,8 +184353,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183498,10 +184363,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183652,28 +184517,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183717,16 +184582,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 @@ -183736,18 +184601,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183759,9 +184624,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183816,20 +184681,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183837,7 +184702,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183863,7 +184728,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183882,19 +184747,19 @@ LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -183909,9 +184774,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183919,8 +184784,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -183980,20 +184845,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184001,8 +184866,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184027,7 +184892,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184043,38 +184908,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 32 LSPB: 64 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -184089,7 +184954,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184144,29 +185009,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184207,39 +185072,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184247,13 +185112,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184308,15 +185173,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -184329,7 +185194,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -184373,14 +185238,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 12416 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -184392,14 +185257,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -184411,13 +185276,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184472,29 +185337,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184535,39 +185400,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184575,13 +185440,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184592,7 +185457,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184636,15 +185501,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -184657,8 +185522,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184701,14 +185566,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -184720,14 +185585,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -184739,13 +185604,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184800,28 +185665,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -184847,7 +185712,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184866,19 +185731,19 @@ LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -184893,9 +185758,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184903,8 +185768,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -184920,7 +185785,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -184964,20 +185829,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184985,7 +185850,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185011,7 +185876,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -185027,38 +185892,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 32 LSPB: 64 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185073,7 +185938,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185128,28 +185993,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185166,49 +186031,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -185219,10 +186080,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185231,13 +186092,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185247,8 +186108,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -185292,31 +186153,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185330,15 +186191,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -185346,47 +186207,43 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 8 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185395,13 +186252,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185411,7 +186268,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185456,16 +186313,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -185476,11 +186333,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185494,59 +186351,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185555,12 +186416,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -185571,8 +186432,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -185616,31 +186477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185679,34 +186540,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185715,8 +186576,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -185776,14 +186637,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -185797,8 +186658,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -185814,49 +186675,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -185867,10 +186724,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185879,12 +186736,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -185895,7 +186752,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185940,8 +186797,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -185949,22 +186806,22 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185984,7 +186841,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186013,8 +186870,12 @@ LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186055,7 +186916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186100,8 +186961,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186109,7 +186970,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186122,7 +186983,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -186138,7 +186999,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186164,17 +187025,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186187,7 +187048,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -186204,7 +187065,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -186260,8 +187121,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186282,9 +187143,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -186298,7 +187159,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186324,21 +187185,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186351,7 +187212,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -186368,7 +187229,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -186424,8 +187285,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186446,9 +187307,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -186487,34 +187348,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -186523,8 +187384,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -186584,14 +187445,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -186605,7 +187466,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -186628,7 +187489,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186657,12 +187518,8 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186703,7 +187560,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186748,8 +187605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186757,7 +187614,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186770,7 +187627,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -186792,7 +187649,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186811,34 +187668,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -186847,8 +187708,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -186863,7 +187724,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186908,16 +187769,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186929,8 +187790,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -186952,7 +187813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186981,8 +187842,12 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -187023,7 +187888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -187068,8 +187933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187077,7 +187942,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -187090,7 +187955,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187106,7 +187971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187131,14 +187996,14 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 512 @@ -187152,18 +188017,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -187171,8 +188036,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -187232,15 +188097,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -187253,10 +188118,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187278,41 +188143,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187324,9 +188189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187335,12 +188200,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187396,8 +188261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187406,19 +188271,19 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187434,7 +188299,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187442,56 +188307,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -187499,12 +188364,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187560,31 +188425,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187724,8 +188589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187746,7 +188611,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187762,7 +188627,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187787,18 +188652,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -187815,10 +188680,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187827,12 +188692,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187888,8 +188753,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1180 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187898,10 +188763,10 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187910,9 +188775,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187926,7 +188791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187951,18 +188816,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -187979,10 +188844,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187991,12 +188856,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188052,8 +188917,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1181 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188062,10 +188927,10 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -188076,7 +188941,7 @@ WorkGroup: [8, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188124,30 +188989,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -188155,12 +189020,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188216,15 +189081,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1182 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -188237,8 +189102,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -188254,7 +189119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188262,56 +189127,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -188319,12 +189184,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188380,15 +189245,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1183 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -188400,11 +189265,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188418,49 +189283,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188471,7 +189332,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188484,7 +189345,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -188499,7 +189360,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -188544,8 +189405,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1184 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188553,7 +189414,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -188564,11 +189425,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188708,8 +189569,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1185 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188730,7 +189591,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -188746,13 +189607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -188772,17 +189633,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -188795,7 +189660,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188811,8 +189676,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188823,7 +189688,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -188868,8 +189733,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1186 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188877,7 +189742,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -188890,9 +189755,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188906,7 +189771,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188932,21 +189797,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -188959,7 +189824,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188975,8 +189840,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189032,8 +189897,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1187 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189054,9 +189919,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189196,8 +190061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1188 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189218,7 +190083,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -189234,7 +190099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189242,7 +190107,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -189250,37 +190115,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -189288,9 +190153,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -189299,12 +190164,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189360,14 +190225,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1189 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -189380,11 +190245,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189398,7 +190263,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189406,7 +190271,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -189414,37 +190279,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -189452,9 +190317,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -189463,12 +190328,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189524,14 +190389,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1190 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -189544,11 +190409,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189688,8 +190553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1191 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189710,7 +190575,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -189735,7 +190600,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -189754,19 +190619,19 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -189781,9 +190646,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -189791,8 +190656,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -189852,8 +190717,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1192 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189862,11 +190727,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -189874,7 +190739,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -189890,7 +190755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189899,7 +190764,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -189916,21 +190781,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -189943,7 +190808,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -189959,7 +190824,7 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -190016,8 +190881,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1193 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190038,9 +190903,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190054,7 +190919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -190063,7 +190928,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190079,18 +190944,182 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6272 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -190100,7 +191129,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190108,10 +191137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -190119,8 +191148,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -190180,31 +191209,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1194 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190344,8 +191373,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1195 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190366,7 +191395,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190388,10 +191417,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190407,22 +191436,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190436,9 +191461,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -190447,13 +191472,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -190463,7 +191488,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190508,29 +191533,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1196 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190672,8 +191697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1197 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190694,7 +191719,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190744,7 +191769,7 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 @@ -190756,14 +191781,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -190836,20 +191861,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1198 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -190857,7 +191882,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -190880,7 +191905,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -190899,34 +191924,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -190935,11 +191964,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -190951,7 +191980,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190996,16 +192025,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1199 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191017,8 +192046,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191160,8 +192189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1200 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191182,7 +192211,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191204,54 +192233,50 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -191264,11 +192289,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191279,7 +192304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191324,29 +192349,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1201 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191368,43 +192393,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191428,11 +192449,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191443,7 +192464,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191488,8 +192509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1202 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191497,7 +192518,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191508,9 +192529,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191534,41 +192555,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191592,11 +192613,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191652,8 +192673,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1203 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191661,7 +192682,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191672,9 +192693,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191696,7 +192717,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -191715,34 +192736,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -191751,11 +192776,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -191767,7 +192792,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191812,14 +192837,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1204 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -191833,8 +192858,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191850,7 +192875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191867,7 +192892,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -191875,20 +192900,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191899,10 +192924,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -191911,12 +192936,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191928,7 +192953,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -191972,31 +192997,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1205 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192010,13 +193035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -192027,7 +193052,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -192036,27 +193061,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -192079,8 +193100,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -192091,7 +193112,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192136,31 +193157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1206 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192174,13 +193195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -192191,7 +193212,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -192200,34 +193221,30 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -192243,8 +193260,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -192255,8 +193272,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -192300,31 +193317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1207 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192338,7 +193355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192363,34 +193380,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192399,8 +193416,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192416,7 +193433,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192460,14 +193477,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1208 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192481,10 +193498,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192532,11 +193549,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 528 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192620,8 +193637,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1209 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192683,34 +193700,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192719,8 +193736,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192780,14 +193797,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1210 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192801,8 +193818,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -192843,34 +193860,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192879,8 +193896,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192940,14 +193957,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1211 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192961,8 +193978,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -192978,13 +193995,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193004,23 +194021,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193055,7 +194076,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193100,20 +194121,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1212 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -193121,10 +194142,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193163,35 +194184,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193199,12 +194220,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -193260,29 +194281,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1213 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -193298,7 +194319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193318,29 +194339,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193349,9 +194370,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193359,7 +194380,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -193420,15 +194441,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1214 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -193441,10 +194462,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193458,7 +194479,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193478,21 +194499,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -193504,7 +194525,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193512,10 +194533,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193523,8 +194544,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -193584,20 +194605,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1215 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -193605,10 +194626,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193622,13 +194643,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193648,34 +194669,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193683,12 +194708,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -193699,8 +194724,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -193744,31 +194769,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1216 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193788,7 +194813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193807,18 +194832,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -193832,10 +194861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193847,7 +194876,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -193859,7 +194888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193904,16 +194933,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1217 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -193925,7 +194954,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -193962,27 +194991,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -193996,9 +195025,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194007,11 +195036,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -194068,28 +195097,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1218 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -194114,7 +195143,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194122,33 +195151,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -194160,9 +195189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194171,7 +195200,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -194232,29 +195261,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1219 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194278,7 +195307,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194286,48 +195315,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194335,13 +195364,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -194396,29 +195425,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1220 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194442,7 +195471,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194450,33 +195479,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -194488,9 +195517,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194499,7 +195528,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -194560,8 +195589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1221 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194570,19 +195599,19 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194724,8 +195753,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1222 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194746,7 +195775,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194888,8 +195917,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1223 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194910,171 +195939,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [32, 8, 2] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 4 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 4 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1224 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -195082,7 +195947,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -195090,16 +195955,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195114,40 +195979,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195155,11 +196021,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195171,6 +196039,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -195178,6 +196047,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -195216,16 +196086,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1225 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -195237,16 +196107,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -195254,7 +196122,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195263,7 +196131,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195278,29 +196146,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195308,10 +196177,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195319,13 +196188,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -195335,6 +196204,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -195342,6 +196212,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -195380,16 +196251,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1226 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -195401,10 +196272,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -195418,48 +196289,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -195472,10 +196343,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -195484,13 +196355,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195504,7 +196373,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -195549,8 +196418,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1227 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -195559,21 +196428,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195585,7 +196456,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195593,40 +196464,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -195639,10 +196510,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -195651,11 +196522,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195669,7 +196540,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -195714,8 +196585,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1228 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -195724,22 +196595,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -195836,7 +196707,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195881,8 +196752,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1229 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196003,7 +196874,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196048,8 +196919,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1230 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196086,14 +196957,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196112,22 +196983,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196140,11 +197011,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196154,10 +197025,12 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196215,8 +197088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1231 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196225,11 +197098,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196239,9 +197112,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196262,14 +197133,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -196279,22 +197150,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196308,10 +197179,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196323,7 +197194,7 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -196382,8 +197253,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1232 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196392,11 +197263,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196407,7 +197278,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -196420,7 +197291,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196428,40 +197299,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 32 + LVCA: 128 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196474,11 +197345,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196486,14 +197357,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196551,8 +197422,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1233 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196561,21 +197432,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196594,16 +197465,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -196613,22 +197484,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196642,10 +197513,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196653,11 +197524,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -196671,7 +197544,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -196716,8 +197589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1234 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196726,11 +197599,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196741,8 +197614,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196762,30 +197633,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -196822,12 +197693,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196885,8 +197756,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1235 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196901,7 +197772,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -196929,7 +197800,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -196937,11 +197808,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -196949,9 +197820,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -196988,12 +197859,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197052,8 +197923,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1236 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197072,7 +197943,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -197096,7 +197967,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197104,11 +197975,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197116,9 +197987,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197155,12 +198026,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197219,8 +198090,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1237 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197239,9 +198110,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -197341,7 +198212,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -197386,8 +198257,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1238 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197422,7 +198293,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -197431,7 +198302,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -197449,12 +198320,179 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -197489,12 +198527,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197508,7 +198544,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197553,8 +198589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1239 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197573,11 +198609,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197596,8 +198634,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197605,11 +198643,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197617,9 +198655,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197656,12 +198694,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197720,8 +198756,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1240 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197740,11 +198776,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197756,16 +198794,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -197783,21 +198821,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -197810,7 +198848,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -197824,11 +198862,9 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197842,7 +198878,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197887,8 +198923,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1241 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197911,7 +198947,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197931,7 +198969,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197939,11 +198977,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197951,9 +198989,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197990,10 +199028,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198052,8 +199090,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1242 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198072,7 +199110,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -198098,7 +199136,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -198106,11 +199144,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -198118,9 +199156,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -198157,10 +199195,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198219,8 +199257,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1243 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198239,9 +199277,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -198341,7 +199379,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -198386,8 +199424,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1244 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198408,7 +199446,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -198424,7 +199462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -198433,7 +199471,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -198451,21 +199489,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -198478,7 +199516,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -198494,7 +199532,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198508,7 +199546,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -198553,8 +199591,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1245 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198577,7 +199615,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -198598,37 +199636,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -198647,9 +199685,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198657,12 +199695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -198720,8 +199760,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1246 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198730,23 +199770,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -198765,8 +199803,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -198774,11 +199812,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -198786,16 +199824,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -198814,9 +199852,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198824,12 +199862,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -198842,7 +199882,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -198887,8 +199927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1247 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198897,23 +199937,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -198925,48 +199963,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -198979,11 +200017,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198991,12 +200029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199054,8 +200094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1248 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199064,23 +200104,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -199100,30 +200138,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199160,12 +200198,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199178,7 +200216,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -199223,8 +200261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1249 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199239,13 +200277,13 @@ ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199267,19 +200305,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -199287,10 +200325,10 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199326,13 +200364,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199390,8 +200428,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1250 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199410,7 +200448,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -199434,19 +200472,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -199454,10 +200492,10 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199493,13 +200531,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199557,8 +200595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1251 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199577,9 +200615,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199679,7 +200717,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -199724,8 +200762,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1252 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199746,7 +200784,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199760,7 +200798,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -199787,6 +200825,173 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 128 @@ -199828,8 +201033,6 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -199891,8 +201094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1253 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199916,6 +201119,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -199934,7 +201139,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -199995,8 +201200,6 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -200058,8 +201261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1254 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -200080,9 +201283,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200100,42 +201305,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -200149,10 +201350,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200160,8 +201361,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -200179,8 +201380,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -200225,8 +201426,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1255 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -200235,19 +201436,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -200261,7 +201462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200269,46 +201470,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -200316,10 +201517,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200327,8 +201528,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -200392,31 +201593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1256 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200428,7 +201629,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200436,57 +201637,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200494,12 +201695,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -200512,7 +201713,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -200557,31 +201758,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1257 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -200595,7 +201796,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200603,57 +201804,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200661,11 +201862,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -200724,31 +201925,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1258 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -200762,57 +201963,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -200826,12 +202031,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -200843,8 +202048,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -200889,31 +202094,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1259 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200925,7 +202130,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200941,7 +202146,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -200952,21 +202157,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -200979,7 +202184,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -200993,12 +202198,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -201056,31 +202261,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1260 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201092,15 +202297,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -201108,34 +202313,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201146,10 +202351,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201158,12 +202363,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -201221,33 +202428,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1261 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201266,43 +202471,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201314,10 +202519,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -201325,8 +202530,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -201388,8 +202595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1262 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201398,23 +202605,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201433,7 +202638,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -201452,22 +202657,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -201481,9 +202686,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201492,13 +202697,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -201557,8 +202760,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1263 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201567,10 +202770,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -201579,9 +202782,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201600,8 +202805,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -201613,30 +202818,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201648,9 +202853,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201659,10 +202864,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -201724,8 +202927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1264 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201734,21 +202937,23 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201760,16 +202965,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -201787,21 +202992,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -201814,7 +203019,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -201828,11 +203033,9 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -201891,8 +203094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1265 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201915,7 +203118,9 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201935,7 +203140,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -201947,44 +203152,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201993,13 +203198,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -202058,29 +203263,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1266 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -202101,57 +203306,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202160,12 +203365,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202223,33 +203430,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1267 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202261,54 +203466,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -202316,9 +203521,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202327,12 +203532,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202390,33 +203597,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1268 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202436,56 +203641,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202494,12 +203699,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202557,28 +203762,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1269 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -202602,8 +203807,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -202615,28 +203820,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202650,9 +203855,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202661,10 +203866,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -202726,8 +203929,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1270 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -202736,21 +203939,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202770,40 +203975,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6720 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202818,9 +204023,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -202828,14 +204033,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202848,7 +204053,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -202893,8 +204098,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1271 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -202903,19 +204108,19 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -202937,7 +204142,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -202945,32 +204150,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202984,10 +204189,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -202995,14 +204200,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203060,28 +204265,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1272 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -203103,41 +204308,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203151,10 +204356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203164,9 +204369,11 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -203180,7 +204387,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -203225,33 +204432,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1273 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203270,41 +204475,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203318,10 +204523,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203331,10 +204536,12 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203392,33 +204599,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1274 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203439,14 +204644,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -203456,22 +204661,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203485,10 +204690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203496,14 +204701,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203561,20 +204766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1275 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -203582,7 +204787,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -203597,7 +204802,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -203605,7 +204810,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -203613,26 +204818,26 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -203640,7 +204845,7 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -203651,11 +204856,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203663,13 +204868,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -203728,8 +204933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1276 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -203738,21 +204943,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203764,13 +204969,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -203790,28 +204995,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -203819,10 +205020,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203830,15 +205031,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -203849,8 +205051,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -203895,20 +205097,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1277 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -203916,10 +205118,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203931,13 +205133,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -203951,34 +205153,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -203986,10 +205184,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203997,15 +205195,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204016,8 +205215,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -204062,31 +205261,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1278 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204098,7 +205297,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -204106,57 +205305,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -204164,15 +205363,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204184,7 +205384,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -204229,31 +205429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1279 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204274,7 +205474,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -204294,36 +205494,36 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -204331,8 +205531,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -204340,6 +205540,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204396,15 +205597,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1280 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -204417,7 +205618,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -204438,38 +205639,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204496,12 +205701,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204514,7 +205717,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -204560,8 +205763,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1281 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204576,7 +205779,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -204585,6 +205788,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204596,44 +205801,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204646,7 +205855,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -204660,12 +205869,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204678,8 +205885,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -204724,8 +205931,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1282 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204740,7 +205947,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -204748,7 +205955,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204768,40 +205977,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204815,9 +206024,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -204826,14 +206035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204892,8 +206101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1283 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204902,19 +206111,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -204928,48 +206137,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204982,10 +206191,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -204994,13 +206203,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205060,8 +206267,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1284 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205070,21 +206277,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205103,7 +206312,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -205122,22 +206331,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205151,9 +206360,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205162,11 +206371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205226,8 +206437,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1285 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205236,10 +206447,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -205248,11 +206459,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205264,16 +206473,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -205290,18 +206499,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -205318,10 +206527,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205330,8 +206539,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -205394,8 +206605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1286 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205404,10 +206615,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -205418,9 +206629,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205432,7 +206641,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -205440,7 +206649,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -205448,32 +206657,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205486,10 +206695,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205498,14 +206707,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -205564,8 +206773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1287 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205574,21 +206783,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205600,48 +206809,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205654,10 +206863,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205666,8 +206875,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -205730,8 +206941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1288 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205740,23 +206951,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205775,8 +206984,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -205784,11 +206993,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -205796,9 +207005,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -205835,12 +207044,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205900,8 +207107,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1289 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205920,11 +207127,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205943,7 +207152,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -206004,8 +207213,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -206068,8 +207275,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1290 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206090,9 +207297,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206110,42 +207319,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -206159,10 +207364,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206170,8 +207375,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -206190,8 +207395,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -206236,8 +207441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1291 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206246,17 +207451,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -206272,50 +207477,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206326,11 +207527,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206338,8 +207539,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -206358,8 +207559,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -206404,8 +207605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1292 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206414,21 +207615,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206440,7 +207641,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206448,8 +207649,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -206460,30 +207661,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206494,10 +207695,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -206506,8 +207707,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -206570,8 +207771,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1293 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206580,21 +207781,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -206608,50 +207809,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206662,10 +207859,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -206674,12 +207871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -206692,7 +207891,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -206738,8 +207937,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1294 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206748,23 +207947,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206776,7 +207973,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206796,26 +207993,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206826,11 +208023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206838,14 +208035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -206859,7 +208056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -206904,8 +208101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1295 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206914,21 +208111,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206940,7 +208137,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206948,38 +208145,38 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 784 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206990,11 +208187,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207002,14 +208199,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207023,7 +208220,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -207068,8 +208265,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1296 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -207078,21 +208275,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207110,9 +208307,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -207124,30 +208321,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -207159,9 +208352,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -207170,7 +208363,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -207188,7 +208381,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207234,8 +208427,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1297 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -207244,19 +208437,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -207298,35 +208491,35 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 8 LSPB: 16 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207334,14 +208527,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207400,29 +208593,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1298 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -207442,55 +208635,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207498,14 +208695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207518,7 +208715,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207564,15 +208761,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1299 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -207580,12 +208777,12 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -207600,16 +208797,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -207627,34 +208824,38 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207662,14 +208863,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207682,7 +208881,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207728,15 +208927,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1300 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -207749,10 +208948,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207764,15 +208965,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -207784,41 +208985,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207826,8 +209031,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -207844,7 +209051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207890,33 +209097,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1301 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207928,13 +209133,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -207948,30 +209153,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -207980,9 +209189,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207990,13 +209199,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208010,7 +209219,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -208056,15 +209265,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1302 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -208077,10 +209286,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208092,15 +209301,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -208108,26 +209317,26 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -208135,11 +209344,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208147,10 +209356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208158,13 +209367,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208224,31 +209431,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1303 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208260,14 +209469,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -208287,27 +209496,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208316,9 +209525,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208326,11 +209535,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208390,15 +209601,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1304 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -208411,16 +209622,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -208428,23 +209637,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -208454,28 +209663,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 32 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208483,10 +209688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208494,14 +209699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -208514,8 +209719,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -208560,31 +209765,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1305 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208596,54 +209801,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208651,10 +209856,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208662,13 +209867,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208728,35 +209931,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1306 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 1313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -208764,54 +209969,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208819,10 +210024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208830,12 +210035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -208894,33 +210101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1307 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208932,7 +210137,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -208940,7 +210145,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -208948,38 +210153,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208987,10 +210192,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208998,8 +210203,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -209064,31 +210269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1308 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209126,18 +210331,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -209151,10 +210356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209162,14 +210367,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209228,8 +210433,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1309 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209238,11 +210443,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209256,7 +210461,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209271,43 +210476,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209319,10 +210524,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209330,8 +210535,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -209349,7 +210556,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209394,8 +210601,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1310 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209404,23 +210611,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209438,7 +210643,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -209458,24 +210663,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 - LSPA: 2 + LSPA: 8 LSPB: 32 - LVCA: 128 + LVCA: 32 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209487,10 +210688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209498,14 +210699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209518,8 +210719,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209564,8 +210765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1311 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209574,11 +210775,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209592,7 +210793,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209600,50 +210801,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209654,11 +210851,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209666,14 +210863,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209686,8 +210883,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209732,8 +210929,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1312 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209742,21 +210939,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209774,7 +210971,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -209803,9 +211000,13 @@ LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -209820,9 +211021,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209830,14 +211031,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209850,7 +211051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -209896,8 +211097,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1313 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209906,11 +211107,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209918,13 +211119,13 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209952,7 +211153,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -209967,7 +211168,7 @@ LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1544 + LdsNumElements: 2048 LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -209975,18 +211176,18 @@ LdsOffsetB: 256 LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -209999,7 +211200,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -210019,7 +211220,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210064,29 +211265,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1314 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -210100,7 +211301,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -210117,7 +211318,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -210126,34 +211327,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 520 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -210162,14 +211363,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210183,7 +211384,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210228,31 +211429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1315 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -210273,14 +211474,14 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -210290,34 +211491,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -210326,14 +211527,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210347,7 +211548,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210392,20 +211593,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1316 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -210413,14 +211614,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210428,54 +211629,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -210483,10 +211684,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -210494,14 +211695,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210515,7 +211714,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210560,31 +211759,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1317 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -210596,7 +211797,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -210604,66 +211805,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -210712,6 +211915,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -210728,35 +211932,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1318 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210764,50 +211968,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -210816,24 +212024,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210846,7 +212056,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -210876,6 +212086,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -210892,15 +212103,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1319 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + SolutionIndex: 1326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -210908,19 +212119,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210928,76 +212139,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -211010,7 +212227,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -211040,6 +212257,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -211056,31 +212274,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1320 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + SolutionIndex: 1327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211092,7 +212310,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -211101,7 +212319,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -211119,50 +212337,52 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -211206,6 +212426,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -211222,15 +212443,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1321 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -211243,10 +212464,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -211267,7 +212488,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -211330,8 +212551,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211395,8 +212614,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1322 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211417,9 +212636,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211431,15 +212652,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211451,28 +212672,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211485,11 +212706,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211499,10 +212720,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211520,7 +212739,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -211566,8 +212785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1323 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211576,21 +212795,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211602,15 +212823,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211622,28 +212843,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211656,11 +212877,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211670,14 +212891,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -211737,8 +212956,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1324 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211747,21 +212966,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211773,7 +212994,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -211781,7 +213002,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211793,28 +213014,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211827,11 +213048,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211841,8 +213062,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211906,8 +213127,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1325 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211916,21 +213137,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -211944,16 +213165,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -211970,18 +213191,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -211998,10 +213219,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -212012,8 +213233,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212031,7 +213254,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -212077,8 +213300,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1326 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212087,10 +213310,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -212099,11 +213322,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212122,9 +213343,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212135,28 +213356,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212170,10 +213391,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212183,8 +213404,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212248,8 +213471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1327 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212258,23 +213481,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212293,9 +213514,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212306,28 +213527,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212341,10 +213562,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212354,8 +213575,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212419,8 +213642,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1328 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212429,23 +213652,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212464,9 +213685,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212477,28 +213698,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212512,10 +213733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212525,8 +213746,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212590,8 +213813,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1329 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212600,23 +213823,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212628,7 +213849,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -212637,7 +213858,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212655,21 +213876,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212682,7 +213903,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -212698,11 +213919,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -212717,7 +213938,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -212763,8 +213984,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1330 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212787,7 +214008,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212806,8 +214027,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -212815,11 +214036,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -212827,9 +214048,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -212868,12 +214089,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -212888,7 +214107,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -212934,8 +214153,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1331 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212954,11 +214173,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212977,7 +214198,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -213040,8 +214261,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213059,7 +214278,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -213105,8 +214324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1332 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213130,6 +214349,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213148,7 +214369,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -213211,8 +214432,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213230,7 +214449,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -213276,8 +214495,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1333 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213301,6 +214520,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213312,16 +214533,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -213339,183 +214560,12 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1334 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 2 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -213552,10 +214602,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -213616,8 +214666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1335 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213636,9 +214686,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -213741,7 +214791,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -213787,8 +214837,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1336 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213809,7 +214859,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -213825,16 +214875,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -213851,39 +214901,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -213893,8 +214943,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213912,7 +214964,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -213958,20 +215010,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1337 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -213979,12 +215031,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213996,14 +215046,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -214022,28 +215072,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214051,10 +215101,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214064,11 +215114,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -214129,20 +215181,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1338 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -214150,12 +215202,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214167,7 +215217,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214193,28 +215243,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214222,10 +215272,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214235,11 +215285,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -214300,20 +215350,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1339 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -214321,10 +215371,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -214338,7 +215388,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214346,7 +215396,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -214354,49 +215404,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214406,14 +215456,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214463,6 +215513,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214473,15 +215524,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1340 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214489,15 +215540,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214509,7 +215560,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214517,46 +215568,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214565,9 +215616,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214577,14 +215628,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214634,6 +215685,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214644,15 +215696,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1341 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214660,15 +215712,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214680,65 +215732,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214748,12 +215796,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214766,7 +215816,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -214803,6 +215853,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214813,15 +215864,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1342 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214829,17 +215880,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214851,7 +215900,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214859,42 +215908,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -214905,7 +215954,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -214921,12 +215970,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214987,8 +216036,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1343 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215003,7 +216052,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -215011,7 +216060,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215023,7 +216072,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -215031,42 +216080,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215077,7 +216126,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -215093,12 +216142,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -215159,8 +216208,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1344 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215175,7 +216224,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -215183,7 +216232,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215203,30 +216252,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false @@ -215261,12 +216310,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -215327,8 +216376,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1345 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215343,13 +216392,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -215398,15 +216447,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215499,8 +216548,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1346 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215521,7 +216570,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -215535,50 +216584,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215589,7 +216638,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -215604,9 +216653,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -215671,8 +216718,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1347 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215691,11 +216738,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215713,8 +216762,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -215742,11 +216791,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215773,8 +216826,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -215791,7 +216842,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -215839,8 +216890,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1348 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215864,6 +216915,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215882,7 +216935,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -215910,15 +216963,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215945,8 +216998,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -216011,8 +217062,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1349 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216036,6 +217087,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216054,43 +217107,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3088 + LdsNumElements: 3616 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216103,9 +217156,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216115,12 +217168,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216134,7 +217189,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -216181,8 +217236,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1350 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216191,23 +217246,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216219,50 +217272,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216273,11 +217326,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216287,12 +217340,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216306,7 +217361,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -216353,8 +217408,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1351 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216363,23 +217418,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216397,44 +217450,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216447,9 +217496,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216459,12 +217508,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216477,7 +217528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -216525,8 +217576,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1352 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216535,13 +217586,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -216550,8 +217601,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216571,42 +217620,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216632,13 +217681,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216652,7 +217701,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -216699,8 +217748,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1353 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216715,11 +217764,11 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -216743,42 +217792,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 4 - LSPB: 32 + LSPB: 64 LVCA: 64 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216790,10 +217839,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216804,13 +217853,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216824,7 +217873,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -216871,8 +217920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1354 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216881,19 +217930,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -216915,38 +217964,38 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216958,10 +218007,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216972,13 +218021,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -217039,8 +218088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1355 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217049,19 +218098,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -217083,8 +218132,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -217095,28 +218144,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -217130,10 +218179,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -217144,7 +218193,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -217164,7 +218213,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -217211,8 +218260,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1356 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217221,17 +218270,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -217255,7 +218304,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -217263,11 +218312,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -217275,9 +218324,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -217316,12 +218365,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -217383,8 +218432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1357 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217403,7 +218452,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -217419,15 +218468,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -217435,28 +218484,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -217469,7 +218522,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -217484,13 +218537,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -217503,7 +218556,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -217551,8 +218604,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1358 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217571,11 +218624,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -217622,15 +218675,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217723,8 +218776,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1359 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217794,15 +218847,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217895,8 +218948,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1360 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1366 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217931,7 +218984,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -217940,7 +218993,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -217958,23 +219011,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217985,7 +219038,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -218001,11 +219054,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218067,8 +219120,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1361 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1367 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218091,7 +219144,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218110,8 +219163,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218119,11 +219172,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218131,22 +219184,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218172,12 +219225,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218192,7 +219243,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -218239,8 +219290,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1362 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1368 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218259,11 +219310,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218282,8 +219335,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218291,11 +219344,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218303,22 +219356,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218344,12 +219397,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218411,8 +219462,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1363 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1369 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218431,11 +219482,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218447,16 +219500,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -218474,23 +219527,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218501,7 +219554,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -218517,11 +219570,9 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218583,8 +219634,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1364 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1370 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218605,9 +219656,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218627,7 +219680,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218635,11 +219688,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218647,9 +219700,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -218688,10 +219741,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218753,8 +219806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1365 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1371 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218773,9 +219826,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -218799,7 +219852,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218807,11 +219860,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218819,22 +219872,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218860,10 +219913,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218925,8 +219978,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1366 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1372 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218945,9 +219998,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -218970,37 +220023,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -219019,9 +220072,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219031,12 +220084,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219097,8 +220152,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1367 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1373 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219107,23 +220162,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219142,8 +220195,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219151,11 +220204,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -219163,16 +220216,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -219191,9 +220244,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219203,12 +220256,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219269,8 +220324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1368 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1374 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219279,23 +220334,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219315,7 +220368,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219323,11 +220376,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -219335,22 +220388,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219363,9 +220416,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219375,12 +220428,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219394,7 +220447,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -219441,8 +220494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1369 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1375 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219451,17 +220504,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -219486,31 +220539,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -219549,12 +220602,10 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219615,8 +220666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1370 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1376 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219631,7 +220682,7 @@ ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -219640,6 +220691,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219659,7 +220712,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219667,34 +220720,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219706,10 +220759,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219719,15 +220772,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -219740,7 +220793,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -219787,28 +220840,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1371 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1377 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -219831,7 +220884,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219839,34 +220892,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219878,10 +220931,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219891,13 +220944,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -219957,28 +221010,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1372 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1378 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -220001,10 +221054,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -220021,24 +221074,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 32 + LVCA: 16 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220050,10 +221099,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220063,13 +221112,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -220081,7 +221130,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -220129,29 +221178,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1373 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1379 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -220175,19 +221224,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -220195,22 +221244,22 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1824 + LdsNumElements: 1680 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 192 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220223,9 +221272,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220235,13 +221284,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 @@ -220256,7 +221305,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -220303,8 +221352,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1374 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1380 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220313,19 +221362,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -220346,9 +221395,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -220368,21 +221417,21 @@ LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 4 + LVCB: 2 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1824 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220395,9 +221444,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220407,8 +221456,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -220473,8 +221524,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1375 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1381 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220483,11 +221534,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -220495,11 +221546,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -220517,9 +221566,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -220527,11 +221576,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -220539,18 +221588,22 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220563,9 +221616,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220575,12 +221628,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220593,8 +221646,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -220641,8 +221694,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1376 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1382 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220651,19 +221704,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -220680,14 +221733,14 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -220695,32 +221748,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 1 LSPB: 16 - LVCA: 32 + LVCA: 128 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1680 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 192 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -220729,15 +221778,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220747,14 +221796,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220767,7 +221816,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -220815,8 +221864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1377 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + SolutionIndex: 1383 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220825,19 +221874,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -220852,64 +221901,60 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220921,12 +221966,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220939,8 +221984,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -220987,8 +222032,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1378 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1384 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220997,19 +222042,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -221024,64 +222069,60 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221092,10 +222133,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 @@ -221109,8 +222152,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -221157,8 +222200,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1379 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + SolutionIndex: 1385 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -221167,23 +222210,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221195,61 +222236,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221259,15 +222300,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221280,7 +222321,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -221327,31 +222368,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1380 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + SolutionIndex: 1386 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221363,61 +222404,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1312 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221427,15 +222472,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221447,7 +222492,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -221495,31 +222540,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1381 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1387 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221532,60 +222577,64 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 - LVCB: 8 - LVPA: 1 - LVPB: 16 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1312 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221595,15 +222644,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221615,7 +222662,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -221663,31 +222710,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1382 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1388 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221699,14 +222748,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -221725,18 +222774,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -221749,7 +222802,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 96 MacroTile1: 64 MacroTileA: 96 @@ -221765,12 +222818,10 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -221783,8 +222834,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -221831,8 +222882,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1383 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1389 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -221853,9 +222904,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221867,14 +222920,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -221893,22 +222946,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -221921,11 +222974,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221937,9 +222990,7 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -222003,8 +223054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1384 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1390 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -222013,11 +223064,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -222025,9 +223076,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222048,14 +223101,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -222065,22 +223118,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -222094,10 +223147,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222111,7 +223164,7 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -222173,8 +223226,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1385 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1391 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -222183,11 +223236,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -222195,10 +223248,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -222211,15 +223264,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222227,38 +223280,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222266,10 +223319,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222279,11 +223332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -222345,33 +223400,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1386 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1392 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222383,7 +223436,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -222391,7 +223444,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222399,38 +223452,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222439,9 +223492,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222451,12 +223504,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222517,33 +223570,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1387 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1393 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 32 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222555,54 +223608,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222610,10 +223663,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222623,12 +223676,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222689,33 +223744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1388 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1394 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222727,7 +223780,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -222735,7 +223788,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222743,38 +223796,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222782,9 +223835,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -222795,14 +223848,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222863,31 +223916,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1389 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1395 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222899,16 +223952,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -222919,34 +223972,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222954,9 +224007,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -222967,8 +224020,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -223033,33 +224088,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1390 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1396 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223080,14 +224133,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -223100,9 +224153,9 @@ LSCA: 8 LSCB: 32 LSPA: 32 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 32 + LVCB: 8 LVPA: 32 LVPB: 8 LdcEqualsLdd: false @@ -223142,11 +224195,11 @@ NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223207,8 +224260,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1391 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1397 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223229,7 +224282,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -223252,39 +224305,39 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223298,9 +224351,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223311,14 +224364,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223379,8 +224432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1392 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 1398 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223389,10 +224442,10 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -223401,7 +224454,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -223422,41 +224475,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 32 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223470,9 +224523,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223483,14 +224536,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223551,8 +224602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1393 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1399 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223561,21 +224612,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223594,8 +224647,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -223607,28 +224660,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223642,9 +224695,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223655,10 +224708,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -223723,8 +224774,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1394 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 1400 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223733,21 +224784,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223759,7 +224812,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -223768,15 +224821,15 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -223786,21 +224839,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 16 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223813,11 +224866,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -223827,14 +224880,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223895,8 +224948,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1395 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1401 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223905,11 +224958,11 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -223919,7 +224972,7 @@ WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223938,9 +224991,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -223951,28 +225004,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223986,10 +225039,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -223999,12 +225052,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -224065,20 +225120,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1396 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + SolutionIndex: 1402 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -224086,12 +225141,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224110,41 +225163,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -224158,10 +225211,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224171,11 +225224,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -224237,20 +225292,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1397 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + SolutionIndex: 1403 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -224258,12 +225313,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224275,15 +225328,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -224292,31 +225345,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -224329,11 +225382,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224343,14 +225396,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -224411,31 +225462,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1398 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1404 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224456,7 +225509,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -224467,7 +225520,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -224476,15 +225529,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -224503,9 +225556,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224515,7 +225568,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 @@ -224583,8 +225636,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1399 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1405 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224593,11 +225646,11 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -224626,9 +225679,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -224639,7 +225692,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -224648,15 +225701,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -224675,9 +225728,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224687,10 +225740,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -224755,8 +225806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1400 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + SolutionIndex: 1406 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224765,11 +225816,11 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -224777,9 +225828,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224791,42 +225844,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -224845,11 +225898,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224859,11 +225912,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -224925,8 +225980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1401 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1407 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224935,23 +225990,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224963,7 +226016,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -224971,42 +226024,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225017,10 +226070,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225031,13 +226084,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225099,8 +226152,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1402 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1408 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225109,21 +226162,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225142,8 +226195,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225151,34 +226204,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225190,9 +226243,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225203,8 +226256,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -225269,8 +226324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1403 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1409 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225279,23 +226334,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225307,50 +226360,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225361,7 +226414,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -225376,12 +226429,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225443,8 +226494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1404 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 1410 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225459,15 +226510,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225479,16 +226532,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -225506,21 +226559,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -225533,7 +226586,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -225549,11 +226602,9 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225615,8 +226666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1405 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1411 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225639,7 +226690,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225658,8 +226711,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225667,34 +226720,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225706,9 +226759,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225719,10 +226772,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -225783,12 +226834,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1406 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1412 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225797,21 +226850,23 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225831,7 +226886,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225839,34 +226894,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225878,9 +226933,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225891,7 +226946,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -225953,12 +227008,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1407 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1413 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225967,19 +227024,19 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -226125,12 +227182,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1408 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1414 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -226167,15 +227226,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -226183,38 +227242,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -226222,10 +227281,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226235,11 +227294,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -226254,7 +227315,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -226303,33 +227364,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1409 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + SolutionIndex: 1415 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226341,15 +227400,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -226357,38 +227416,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -226396,10 +227455,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226409,11 +227468,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -226477,33 +227538,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1410 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + SolutionIndex: 1416 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226515,16 +227574,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -226541,39 +227600,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226583,8 +227642,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -226651,20 +227712,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1411 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1417 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -226672,12 +227733,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226696,7 +227755,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -226759,8 +227818,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -226827,8 +227884,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1412 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1418 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -226852,6 +227909,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226870,7 +227929,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -226933,8 +227992,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -227001,8 +228058,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1413 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1419 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227023,9 +228080,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -227044,7 +228103,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -227060,25 +228119,26 @@ GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -227092,9 +228152,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -227105,21 +228165,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227175,8 +228235,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1414 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1420 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227185,10 +228245,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -227200,6 +228260,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -227219,7 +228281,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -227227,21 +228289,22 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -227280,18 +228343,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227347,8 +228412,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1415 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1421 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227367,7 +228432,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -227408,6 +228473,7 @@ GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly @@ -227464,8 +228530,10 @@ PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227521,8 +228589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1416 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1422 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227543,13 +228611,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227580,7 +228648,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227623,7 +228691,6 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 @@ -227698,8 +228765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1417 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1423 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227714,7 +228781,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -227726,7 +228793,7 @@ fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227744,7 +228811,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -227752,12 +228819,12 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227765,9 +228832,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -227800,16 +228867,15 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -227826,7 +228892,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -227875,8 +228941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1418 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 1424 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227891,19 +228957,19 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227934,7 +229000,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227949,15 +229015,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -227977,7 +229043,6 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 @@ -228052,8 +229117,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1419 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1425 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228068,7 +229133,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -228080,7 +229145,7 @@ fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228090,23 +229155,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -228118,23 +229183,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228145,11 +229210,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228158,12 +229223,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228228,8 +229295,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1420 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1426 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW1_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228238,11 +229305,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -228252,11 +229319,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228273,7 +229338,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -228287,7 +229352,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -228302,15 +229367,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228336,6 +229401,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -228355,7 +229422,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -228404,8 +229471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1421 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1427 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228420,7 +229487,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -228429,10 +229496,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228449,9 +229514,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -228463,30 +229528,30 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 32 - LVCB: 4 + LVCA: 16 + LVCB: 2 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228498,9 +229563,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -228512,11 +229577,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -228531,7 +229598,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -228580,33 +229647,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1422 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1428 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -228626,7 +229691,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -228634,33 +229699,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -228674,10 +229739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228686,14 +229751,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228758,35 +229823,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1423 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW1_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1429 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -228794,51 +229859,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 128 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2832 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228849,11 +229910,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 48 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228862,14 +229923,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228884,7 +229945,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -228934,31 +229995,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1424 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1430 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x48x16_SE_AMAS1_DTL0_EPS0_GRVW1_LPB1_NLCA1_PBD0_PGR0_PLR0_TT4_6_USFGRO1_VW1_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -228979,7 +230040,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -228997,22 +230058,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -229026,9 +230087,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -229046,7 +230107,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -229061,7 +230122,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -229110,14 +230171,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1425 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1431 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] @@ -229131,8 +230192,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -229146,16 +230207,17 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -229174,23 +230236,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -229201,11 +230263,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229214,13 +230276,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -229244,6 +230306,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229267,6 +230330,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229286,20 +230350,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1426 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG32_8_1_WGM64 + SolutionIndex: 1432 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -229307,14 +230371,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -229326,24 +230390,25 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -229351,18 +230416,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2832 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -229375,9 +230444,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 48 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 48 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229386,14 +230455,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -229408,14 +230475,15 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229439,6 +230507,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229458,31 +230527,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1427 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x48x16_SE_AMAS1_DTL0_EPS0_GRVW1_LPB1_NLCA1_PBD0_PGR0_PLR0_TT4_6_USFGRO1_VW1_WG32_8_1_WGM1 + SolutionIndex: 1433 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -229494,16 +230565,17 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -229514,35 +230586,35 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -229550,10 +230622,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229562,10 +230634,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -229592,6 +230662,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229615,6 +230686,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229634,31 +230706,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1428 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG32_8_1_WGM64 + SolutionIndex: 1434 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -236130,6436 +237204,6448 @@ - [714, 7844.54] - - [1225, 64, 64, 256] - [721, 8721.52] + - - [65, 6400, 1, 1024] + - [722, 2839.89] + - - [256, 6400, 1, 4096] + - [723, 7361.66] + - - [1024, 64, 1, 4096] + - [724, 3787.18] - - [704, 1024, 1, 128] - - [824, 3019.46] + - [827, 3019.46] - - [1024, 1024, 1, 3328] - - [862, 8162.55] + - [865, 8162.55] - - [4, 704, 1, 1280] - - [765, 319.546] + - [768, 319.546] - - [4, 1856, 1, 3328] - - [795, 550.514] + - [798, 550.514] - - [1856, 448, 1, 3328] - - [847, 6813.05] + - [850, 6813.05] - - [2944, 4288, 1, 1280] - - [856, 8975.76] + - [859, 8975.76] - - [2368, 64, 1, 3328] - - [770, 5482.23] + - [773, 5482.23] - - [1760, 32, 1, 1760] - - [809, 3859.94] + - [812, 3859.94] - - [2368, 5888, 1, 256] - - [853, 8656.73] + - [856, 8656.73] - - [5888, 1856, 1, 256] - - [843, 7881.43] + - [846, 7881.43] - - [64, 3584, 1, 1280] - - [779, 4835.33] + - [782, 4835.33] - - [512, 24000, 1, 1536] - - [850, 8665.9] + - [853, 8665.9] - - [128, 6784, 1, 3328] - - [847, 7062.25] + - [850, 7062.25] - - [5888, 1408, 1, 256] - - [860, 8130.22] + - [863, 8130.22] - - [5888, 1856, 1, 3328] - - [850, 8840.75] + - [853, 8840.75] - - [512, 4, 1, 512] - - [735, 170.223] + - [738, 170.223] - - [35, 1500, 1, 2560] - - [739, 2896.55] + - [742, 2896.55] - - [1856, 4288, 1, 256] - - [839, 8374.63] + - [842, 8374.63] - - [1024, 5056, 1, 128] - - [836, 3304.25] + - [839, 3304.25] - - [5056, 5056, 1, 3328] - - [850, 8905.43] + - [853, 8905.43] - - [1408, 5888, 1, 1280] - - [850, 9418.1] + - [853, 9418.1] - - [2368, 448, 1, 128] - - [824, 3074.97] + - [827, 3074.97] - - [6144, 6000, 1, 2560] - - [850, 9336.33] + - [853, 9336.33] - - [2368, 6784, 1, 128] - - [823, 4919.26] + - [826, 4919.26] - - [1024, 3584, 1, 3328] - - [841, 8071.07] + - [844, 8071.07] - - [512, 48000, 1, 2048] - - [850, 8763.06] + - [853, 8763.06] - - [1408, 64, 1, 128] - - [746, 805.47] + - [749, 805.47] - - [256, 4288, 1, 3328] - - [872, 6331.86] + - [875, 6331.86] - - [5888, 1408, 1, 1280] - - [840, 9226.17] + - [843, 9226.17] - - [704, 1856, 1, 3328] - - [866, 6309.4] + - [869, 6309.4] - - [1408, 4288, 1, 256] - - [850, 8374.5] + - [853, 8374.5] - - [1024, 2368, 1, 256] - - [847, 7341.02] + - [850, 7341.02] - - [64, 4, 1, 256] - - [790, 13.0032] + - [793, 13.0032] - - [1408, 1856, 1, 1280] - - [857, 8772.95] + - [860, 8772.95] - - [1408, 64, 1, 1280] - - [803, 4049.98] + - [806, 4049.98] - - [448, 1024, 1, 1280] - - [866, 6071.16] + - [869, 6071.16] - - [4096, 32, 1, 4096] - - [800, 5491.72] + - [803, 5491.72] - - [256, 1408, 1, 3328] - - [852, 5351.39] + - [855, 5351.39] - - [5056, 5056, 1, 1280] - - [860, 9408.57] + - [863, 9408.57] - - [448, 5056, 1, 256] - - [865, 6680.44] + - [868, 6680.44] - - [704, 1856, 1, 1280] - - [842, 7503.93] + - [845, 7503.93] - - [128, 5056, 1, 128] - - [757, 2316.48] + - [760, 2316.48] - - [2368, 128, 1, 256] - - [842, 3660.12] + - [845, 3660.12] - - [1856, 1408, 1, 128] - - [829, 3885.87] + - [832, 3885.87] - - [64, 5056, 1, 256] - - [852, 3318.81] + - [855, 3318.81] - - [6784, 256, 1, 3328] - - [850, 7590.54] + - [853, 7590.54] - - [1408, 3584, 1, 256] - - [839, 8276.82] + - [842, 8276.82] - - [4288, 448, 1, 256] - - [852, 7139.69] + - [855, 7139.69] - - [64, 704, 1, 128] - - [753, 375.467] + - [756, 375.467] - - [1024, 1856, 1, 128] - - [822, 2890.56] + - [825, 2890.56] - - [4288, 2944, 1, 1280] - - [856, 8981.35] + - [859, 8981.35] - - [704, 5056, 1, 1280] - - [842, 7684.62] + - [845, 7684.62] - - [2368, 704, 1, 3328] - - [857, 7070.04] + - [860, 7070.04] - - [256, 5888, 1, 256] - - [842, 7319.35] + - [845, 7319.35] - - [1856, 4288, 1, 3328] - - [840, 9238.59] + - [843, 9238.59] - - [256, 2944, 1, 256] - - [842, 6090.21] + - [845, 6090.21] - - [5888, 1024, 1, 256] - - [846, 8269.95] + - [849, 8269.95] - - [448, 64, 1, 1280] - - [799, 2493.22] + - [802, 2493.22] - - [3072, 64, 1, 1024] - - [782, 3149.67] + - [785, 3149.67] - - [3584, 4, 1, 1280] - - [884, 567.762] + - [887, 567.762] - - [2560, 16, 1, 2560] - - [791, 2887.05] + - [794, 2887.05] - - [2944, 64, 1, 256] - - [782, 2565.66] + - [785, 2565.66] - - [128, 4, 1, 1280] - - [885, 78.7692] + - [888, 78.7692] - - [1408, 2944, 1, 256] - - [846, 8337.2] + - [849, 8337.2] - - [256, 1856, 1, 1280] - - [872, 6267.25] + - [875, 6267.25] - - [6784, 5056, 1, 3328] - - [856, 9423.9] + - [859, 9423.9] - - [5056, 5056, 1, 256] - - [843, 8758.23] + - [846, 8758.23] - - [128, 256, 1, 256] - - [798, 1205.26] + - [801, 1205.26] - - [64, 1024, 1, 1280] - - [809, 3566.58] + - [812, 3566.58] - - [2944, 4, 1, 256] - - [762, 319.349] + - [765, 319.349] - - [704, 5056, 1, 128] - - [831, 4073.73] + - [834, 4073.73] - - [4, 2368, 1, 1280] - - [790, 496.892] + - [793, 496.892] - - [2368, 2944, 1, 1280] - - [839, 9085.45] + - [842, 9085.45] - - [448, 448, 1, 3328] - - [817, 5428.66] + - [820, 5428.66] - - [6784, 6784, 1, 1280] - - [856, 8726.93] + - [859, 8726.93] - - [1024, 256, 1, 3328] - - [866, 5499.32] + - [869, 5499.32] - - [1408, 4288, 1, 1280] - - [840, 9094.32] + - [843, 9094.32] - - [3584, 4288, 1, 1280] - - [843, 8703.78] + - [846, 8703.78] - - [512, 6000, 1, 2560] - - [846, 8474.46] + - [849, 8474.46] - - [2368, 704, 1, 1280] - - [852, 7651.49] + - [855, 7651.49] - - [5056, 4288, 1, 3328] - - [860, 8545.25] + - [863, 8545.25] - - [3584, 2368, 1, 3328] - - [848, 8797.78] + - [851, 8797.78] - - [5888, 6784, 1, 1280] - - [846, 8785.08] + - [849, 8785.08] - - [64, 704, 1, 1280] - - [769, 2783.38] + - [772, 2783.38] - - [4288, 256, 1, 256] - - [842, 6162.68] + - [845, 6162.68] - - [2944, 128, 1, 128] - - [744, 1951.23] + - [747, 1951.23] - - [6144, 32, 1, 2560] - - [803, 4588.95] + - [806, 4588.95] - - [6784, 448, 1, 1280] - - [847, 8674.21] + - [850, 8674.21] - - [2944, 5888, 1, 256] - - [860, 8991.66] + - [863, 8991.66] - - [64, 64, 1, 1280] - - [820, 712.348] + - [823, 712.348] - - [4288, 2944, 1, 256] - - [856, 8678.04] + - [859, 8678.04] - - [5888, 704, 1, 1280] - - [846, 8652.61] + - [849, 8652.61] - - [5056, 4, 1, 3328] - - [762, 650.672] + - [765, 650.672] - - [1856, 64, 1, 1280] - - [779, 4471.87] + - [782, 4471.87] - - [1760, 16, 1, 1760] - - [819, 2592.13] + - [822, 2592.13] - - [448, 5888, 1, 128] - - [829, 3822.93] + - [832, 3822.93] - - [5888, 64, 1, 3328] - - [811, 6013.12] + - [814, 6013.12] - - [2944, 256, 1, 3328] - - [852, 7791.35] + - [855, 7791.35] - - [1024, 64, 1, 128] - - [753, 592.416] + - [756, 592.416] - - [5056, 2368, 1, 1280] - - [839, 9260.43] + - [842, 9260.43] - - [448, 3584, 1, 1280] - - [860, 6771.24] + - [863, 6771.24] - - [6784, 5888, 1, 256] - - [854, 7933.29] + - [857, 7933.29] - - [64, 1024, 1, 3328] - - [803, 4782.98] + - [806, 4782.98] - - [704, 128, 1, 1280] - - [809, 3971.88] + - [812, 3971.88] - - [4, 3584, 1, 128] - - [878, 59.4238] + - [881, 59.4238] - - [1408, 448, 1, 1280] - - [852, 5902.07] + - [855, 5902.07] - - [1024, 1408, 1, 256] - - [847, 5272.84] + - [850, 5272.84] - - [2368, 2368, 1, 3328] - - [852, 8488.66] + - [855, 8488.66] - - [1856, 6784, 1, 128] - - [829, 4742.41] + - [832, 4742.41] - - [5056, 704, 1, 3328] - - [855, 7772.38] + - [858, 7772.38] - - [1408, 1856, 1, 256] - - [873, 5229.74] + - [876, 5229.74] - - [1408, 704, 1, 3328] - - [873, 6954.83] + - [876, 6954.83] - - [2368, 5056, 1, 256] - - [846, 8580.58] + - [849, 8580.58] - - [1408, 256, 1, 1280] - - [872, 4790.01] + - [875, 4790.01] - - [3072, 128, 1, 1024] - - [868, 4579.77] + - [871, 4579.77] - - [3584, 2368, 1, 1280] - - [839, 8675.03] + - [842, 8675.03] - - [4288, 64, 1, 3328] - - [818, 5550.01] + - [821, 5550.01] - - [2368, 4, 1, 1280] - - [884, 537.418] + - [887, 537.418] - - [704, 5888, 1, 256] - - [840, 5305.78] + - [843, 5305.78] - - [6784, 2944, 1, 128] - - [836, 4344.11] + - [839, 4344.11] - - [6784, 64, 1, 256] - - [866, 4496.32] + - [869, 4496.32] - - [2944, 256, 1, 256] - - [852, 6553.6] + - [855, 6553.6] - - [2944, 6784, 1, 3328] - - [840, 8895.66] + - [843, 8895.66] - - [128, 1, 1, 1408] - - [820, 25.6] + - [823, 25.6] - - [704, 1408, 1, 3328] - - [854, 7913.11] + - [857, 7913.11] - - [3584, 704, 1, 3328] - - [839, 7526.33] + - [842, 7526.33] - - [2944, 256, 1, 128] - - [823, 2830.66] + - [826, 2830.66] - - [6784, 4, 1, 1280] - - [880, 645.135] + - [883, 645.135] - - [1024, 64, 1, 1280] - - [778, 3013.15] + - [781, 3013.15] - - [8448, 4, 1, 2816] - - [730, 984.668] + - [733, 984.668] - - [448, 4288, 1, 256] - - [852, 7139.69] + - [855, 7139.69] - - [64, 3584, 1, 3328] - - [776, 5683.17] + - [779, 5683.17] - - [704, 2368, 1, 1280] - - [860, 7045.2] + - [863, 7045.2] - - [1856, 2368, 1, 1280] - - [857, 8327.8] + - [860, 8327.8] - - [2368, 128, 1, 3328] - - [793, 6082.55] + - [796, 6082.55] - - [64, 193600, 1, 64] - - [842, 6747.67] + - [845, 6747.67] - - [1760, 128, 1, 1760] - - [770, 5512.97] + - [773, 5512.97] - - [448, 1408, 1, 256] - - [852, 5591.44] + - [855, 5591.44] - - [1856, 4288, 1, 1280] - - [850, 8647.62] + - [853, 8647.62] - - [64, 5056, 1, 3328] - - [810, 6096.49] + - [813, 6096.49] - - [512, 1500, 1, 2816] - - [852, 7879.2] + - [855, 7879.2] - - [1024, 448, 1, 128] - - [824, 1844.23] + - [827, 1844.23] - - [704, 4, 1, 1280] - - [790, 341.333] + - [793, 341.333] - - [704, 256, 1, 128] - - [824, 1001.24] + - [827, 1001.24] - - [256, 193600, 1, 64] - - [860, 8113.2] + - [863, 8113.2] - - [704, 2944, 1, 128] - - [831, 3747.03] + - [834, 3747.03] - - [1408, 1024, 1, 1280] - - [857, 7080.61] + - [860, 7080.61] - - [704, 6784, 1, 256] - - [875, 6630.37] + - [878, 6630.37] - - [6784, 704, 1, 256] - - [842, 8005.76] + - [845, 8005.76] - - [5056, 1408, 1, 128] - - [833, 4303.03] + - [836, 4303.03] - - [2048, 7000, 1, 2048] - - [850, 9269.1] + - [853, 9269.1] - - [256, 3584, 1, 3328] - - [844, 7334.38] + - [847, 7334.38] - - [5056, 704, 1, 256] - - [852, 7954.02] + - [855, 7954.02] - - [128, 1408, 1, 128] - - [747, 1242.92] + - [750, 1242.92] - - [3584, 4288, 1, 3328] - - [876, 7683.71] + - [879, 7683.71] - - [5888, 1856, 1, 1280] - - [840, 8831.24] + - [843, 8831.24] - - [256, 1408, 1, 256] - - [842, 4352.58] + - [845, 4352.58] - - [5056, 64, 1, 1280] - - [809, 5011.95] + - [812, 5011.95] - - [1024, 704, 1, 256] - - [842, 5710.07] + - [845, 5710.07] - - [64, 256, 1, 128] - - [748, 149.797] + - [751, 149.797] - - [2368, 3584, 1, 1280] - - [850, 8609.58] + - [853, 8609.58] - - [1024, 256, 1, 256] - - [866, 3276.8] + - [869, 3276.8] - - [1856, 4, 1, 1280] - - [764, 497.004] + - [767, 497.004] - - [448, 448, 1, 256] - - [852, 3117.73] + - [855, 3117.73] - - [2944, 3584, 1, 3328] - - [840, 8879.35] + - [843, 8879.35] - - [7680, 32, 1, 2560] - - [810, 5310.14] + - [813, 5310.14] - - [128, 4288, 1, 128] - - [750, 2116.1] + - [753, 2116.1] - - [256, 256, 1, 3328] - - [803, 4774.6] + - [806, 4774.6] - - [128, 1024, 1, 3328] - - [804, 5894.7] + - [807, 5894.7] - - [4, 1408, 1, 3328] - - [795, 552.574] + - [798, 552.574] - - [6784, 2944, 1, 256] - - [858, 8271.08] + - [861, 8271.08] - - [64, 1856, 1, 1280] - - [809, 4167.86] + - [812, 4167.86] - - [64, 1024, 1, 128] - - [743, 589.088] + - [746, 589.088] - - [1024, 1500, 1, 2560] - - [847, 8407.78] + - [850, 8407.78] - - [1856, 2368, 1, 256] - - [842, 8092.05] + - [845, 8092.05] - - [3584, 256, 1, 128] - - [825, 2607.47] + - [828, 2607.47] - - [3584, 6784, 1, 3328] - - [859, 8558.73] + - [862, 8558.73] - - [256, 1024, 1, 256] - - [852, 3901.68] + - [855, 3901.68] - - [4, 6784, 1, 3328] - - [790, 662.475] + - [793, 662.475] - - [1024, 5888, 1, 3328] - - [850, 9161.66] + - [853, 9161.66] - - [1024, 128, 1, 1280] - - [807, 3942.02] + - [810, 3942.02] - - [3072, 32, 1, 1024] - - [784, 2840.39] + - [787, 2840.39] - - [6144, 24000, 1, 2560] - - [840, 7605.77] + - [843, 7605.77] - - [448, 1024, 1, 256] - - [842, 5062.09] + - [845, 5062.09] - - [5056, 4288, 1, 1280] - - [850, 9090.89] + - [853, 9090.89] - - [5888, 64, 1, 256] - - [852, 4449.68] + - [855, 4449.68] - - [1856, 256, 1, 1280] - - [866, 5834.36] + - [869, 5834.36] - - [64, 5888, 1, 3328] - - [804, 6152.34] + - [807, 6152.34] - - [2368, 2368, 1, 1280] - - [844, 8594.56] + - [847, 8594.56] - - [2944, 5888, 1, 128] - - [829, 4776.09] + - [832, 4776.09] - - [704, 5888, 1, 1280] - - [844, 8435.81] + - [847, 8435.81] - - [2368, 3584, 1, 128] - - [826, 4590.61] + - [829, 4590.61] - - [1856, 5056, 1, 128] - - [837, 4503.38] + - [840, 4503.38] - - [4608, 1, 1, 1536] - - [735, 226.855] + - [738, 226.855] - - [448, 256, 1, 3328] - - [779, 5415.46] + - [782, 5415.46] - - [2944, 6784, 1, 1280] - - [863, 8385.01] + - [866, 8385.01] - - [448, 1856, 1, 128] - - [833, 2618.86] + - [836, 2618.86] - - [128, 1024, 1, 128] - - [742, 940.427] + - [745, 940.427] - - [7680, 4, 1, 2560] - - [766, 985.004] + - [769, 985.004] - - [1024, 704, 1, 1280] - - [852, 7204.46] + - [855, 7204.46] - - [128, 5888, 1, 256] - - [842, 6313.42] + - [845, 6313.42] - - [1024, 5056, 1, 1280] - - [847, 8979.66] + - [850, 8979.66] - - [4288, 1024, 1, 256] - - [839, 7198.19] + - [842, 7198.19] - - [2944, 2368, 1, 128] - - [824, 4624.47] + - [827, 4624.47] - - [704, 704, 1, 3328] - - [865, 5870.61] + - [868, 5870.61] - - [704, 1408, 1, 1280] - - [854, 7680.22] + - [857, 7680.22] - - [5888, 448, 1, 1280] - - [842, 7718.56] + - [845, 7718.56] - - [3584, 256, 1, 3328] - - [847, 7523.78] + - [850, 7523.78] - - [704, 5888, 1, 3328] - - [852, 8196.89] + - [855, 8196.89] - - [704, 1856, 1, 128] - - [830, 3388.33] + - [833, 3388.33] - - [128, 3584, 1, 3328] - - [804, 6626.4] + - [807, 6626.4] - - [4, 4288, 1, 128] - - [877, 159.548] + - [880, 159.548] - - [128, 704, 1, 1280] - - [767, 4038.63] + - [770, 4038.63] - - [3584, 2944, 1, 256] - - [840, 7685.89] + - [843, 7685.89] - - [1856, 128, 1, 3328] - - [796, 6070.53] + - [799, 6070.53] - - [1856, 2368, 1, 3328] - - [857, 8460.52] + - [860, 8460.52] - - [512, 6000, 1, 2816] - - [860, 9019.45] + - [863, 9019.45] - - [2944, 448, 1, 128] - - [823, 3027.63] + - [826, 3027.63] - - [64, 193600, 1, 256] - - [866, 7080.22] + - [869, 7080.22] - - [128, 2944, 1, 1280] - - [842, 5397.77] + - [845, 5397.77] - - [448, 2944, 1, 1280] - - [852, 6996.87] + - [855, 6996.87] - - [512, 24000, 1, 2048] - - [860, 8832.57] + - [863, 8832.57] - - [128, 256, 1, 3328] - - [799, 3531.47] + - [802, 3531.47] - - [1408, 5056, 1, 3328] - - [855, 7969.84] + - [858, 7969.84] - - [1856, 1856, 1, 3328] - - [842, 8140.24] + - [845, 8140.24] - - [3584, 128, 1, 256] - - [852, 4860.95] + - [855, 4860.95] - - [448, 1408, 1, 3328] - - [842, 6353.65] + - [845, 6353.65] - - [2368, 2368, 1, 256] - - [856, 8369.27] + - [859, 8369.27] - - [4288, 4288, 1, 1280] - - [846, 8666.42] + - [849, 8666.42] - - [64, 448, 1, 1280] - - [799, 2591.82] + - [802, 2591.82] - - [5888, 1024, 1, 1280] - - [839, 8526.5] + - [842, 8526.5] - - [704, 1024, 1, 256] - - [852, 4971.7] + - [855, 4971.7] - - [1024, 12544, 1, 256] - - [890, 8611.8] + - [893, 8611.8] - - [448, 4, 1, 256] - - [795, 78.5534] + - [798, 78.5534] - - [5888, 448, 1, 128] - - [826, 3591.93] + - [829, 3591.93] - - [512, 48000, 1, 2560] - - [860, 9237.34] + - [863, 9237.34] - - [8448, 16, 1, 2816] - - [725, 3360.11] + - [728, 3360.11] - - [704, 6784, 1, 3328] - - [861, 7774.85] + - [864, 7774.85] - - [5888, 5888, 1, 1280] - - [847, 9238.15] + - [850, 9238.15] - - [5056, 1024, 1, 1280] - - [875, 8227.78] + - [878, 8227.78] - - [448, 5888, 1, 3328] - - [850, 7777.53] + - [853, 7777.53] - - [3072, 2, 1, 1024] - - [787, 376.283] + - [790, 376.283] - - [1024, 2944, 1, 1280] - - [840, 8650.35] + - [843, 8650.35] - - [5056, 5888, 1, 1280] - - [850, 8861.5] + - [853, 8861.5] - - [4288, 5888, 1, 128] - - [830, 5048.91] + - [833, 5048.91] - - [256, 3584, 1, 256] - - [842, 6314.01] + - [845, 6314.01] - - [256, 4, 1, 1280] - - [886, 163.84] + - [889, 163.84] - - [1408, 3584, 1, 128] - - [830, 4290.12] + - [833, 4290.12] - - [256, 2944, 1, 3328] - - [852, 7620.89] + - [855, 7620.89] - - [448, 3584, 1, 128] - - [830, 3353.8] + - [833, 3353.8] - - [5888, 2944, 1, 1280] - - [840, 9498.21] + - [843, 9498.21] - - [4, 6784, 1, 1280] - - [790, 623.816] + - [793, 623.816] - - [2368, 5888, 1, 128] - - [829, 4840.19] + - [832, 4840.19] - - [35, 8457, 1, 1760] - - [736, 4059.78] + - [739, 4059.78] - - [64, 2944, 1, 128] - - [747, 1310.72] + - [750, 1310.72] - - [2368, 4, 1, 256] - - [881, 369.639] + - [884, 369.639] - - [3584, 5888, 1, 256] - - [858, 7996.23] + - [861, 7996.23] - - [2368, 1024, 1, 128] - - [824, 3914.97] + - [827, 3914.97] - - [2368, 704, 1, 128] - - [824, 3658.87] + - [827, 3658.87] - - [512, 32, 1, 512] - - [813, 1127.5] + - [816, 1127.5] - - [3584, 2368, 1, 128] - - [824, 4462.38] + - [827, 4462.38] - - [5056, 704, 1, 128] - - [823, 4062.11] + - [826, 4062.11] - - [448, 2368, 1, 128] - - [824, 2828.97] + - [827, 2828.97] - - [4, 5056, 1, 256] - - [772, 425.768] + - [775, 425.768] - - [5056, 1408, 1, 3328] - - [857, 8848.82] + - [860, 8848.82] - - [1408, 704, 1, 256] - - [852, 5394.46] + - [855, 5394.46] - - [6784, 1024, 1, 3328] - - [839, 9231.92] + - [842, 9231.92] - - [6784, 2944, 1, 3328] - - [850, 8714.74] + - [853, 8714.74] - - [7680, 1, 1, 2560] - - [786, 248.745] + - [789, 248.745] - - [1856, 1856, 1, 256] - - [851, 7586.48] + - [854, 7586.48] - - [64, 64, 1, 3328] - - [821, 1363.15] + - [824, 1363.15] - - [512, 1, 1, 512] - - [735, 43.1158] + - [738, 43.1158] - - [6784, 2368, 1, 1280] - - [852, 8665.64] + - [855, 8665.64] - - [4608, 2, 1, 1536] - - [735, 452.55] + - [738, 452.55] - - [4288, 3584, 1, 256] - - [860, 8936.6] + - [863, 8936.6] - - [4288, 5888, 1, 1280] - - [857, 8957.05] + - [860, 8957.05] - - [4608, 4, 1, 1536] - - [728, 846.637] + - [731, 846.637] - - [1024, 6000, 1, 1536] - - [850, 8398.44] + - [853, 8398.44] - - [8448, 32, 1, 2816] - - [810, 5342.97] + - [813, 5342.97] - - [448, 2944, 1, 3328] - - [857, 7246.94] + - [860, 7246.94] - - [4288, 1856, 1, 1280] - - [840, 8902.76] + - [843, 8902.76] - - [1856, 2944, 1, 3328] - - [852, 8622.76] + - [855, 8622.76] - - [256, 6784, 1, 3328] - - [852, 8050.67] + - [855, 8050.67] - - [512, 3000, 1, 1536] - - [873, 7108.02] + - [876, 7108.02] - - [64, 5888, 1, 256] - - [865, 3567.64] + - [868, 3567.64] - - [256, 5056, 1, 128] - - [832, 3041.02] + - [835, 3041.02] - - [5056, 1024, 1, 256] - - [856, 8401.37] + - [859, 8401.37] - - [704, 64, 1, 3328] - - [815, 4298.92] + - [818, 4298.92] - - [5056, 1856, 1, 3328] - - [860, 8660.67] + - [863, 8660.67] - - [4, 2944, 1, 3328] - - [790, 618.537] + - [793, 618.537] - - [512, 1500, 1, 2048] - - [872, 5481.12] + - [875, 5481.12] - - [1024, 1, 1, 500000] - - [726, 259.961] + - [729, 259.961] - - [256, 4, 1, 256] - - [790, 50.4123] + - [793, 50.4123] - - [6784, 128, 1, 3328] - - [844, 6950.81] + - [847, 6950.81] - - [4288, 1408, 1, 128] - - [824, 4539.48] + - [827, 4539.48] - - [1856, 5888, 1, 3328] - - [850, 8712.83] + - [853, 8712.83] - - [4288, 5056, 1, 256] - - [856, 8997.05] + - [859, 8997.05] - - [1408, 128, 1, 1280] - - [779, 4599.02] + - [782, 4599.02] - - [4096, 7000, 1, 4096] - - [846, 8555.79] + - [849, 8555.79] - - [5056, 256, 1, 3328] - - [852, 8257.06] + - [855, 8257.06] - - [704, 704, 1, 256] - - [842, 5852.29] + - [845, 5852.29] - - [1024, 3000, 1, 2560] - - [839, 8258.74] + - [842, 8258.74] - - [1024, 5888, 1, 1280] - - [839, 8988.89] + - [842, 8988.89] - - [6784, 2368, 1, 128] - - [825, 4562.15] + - [828, 4562.15] - - [4, 5056, 1, 1280] - - [790, 600.341] + - [793, 600.341] - - [256, 64, 1, 1280] - - [813, 1899.59] + - [816, 1899.59] - - [128, 1856, 1, 1280] - - [852, 5185.66] + - [855, 5185.66] - - [1856, 1024, 1, 1280] - - [857, 7875.85] + - [860, 7875.85] - - [6784, 4288, 1, 1280] - - [860, 8981.08] + - [863, 8981.08] - - [1856, 1856, 1, 1280] - - [841, 7794.61] + - [844, 7794.61] - - [35, 1500, 1, 2048] - - [741, 2192.5] + - [744, 2192.5] - - [3072, 24000, 1, 1024] - - [853, 8690.48] + - [856, 8690.48] - - [1408, 5056, 1, 1280] - - [852, 8427.77] + - [855, 8427.77] - - [4, 2368, 1, 3328] - - [795, 594.322] + - [798, 594.322] - - [5888, 1856, 1, 128] - - [824, 4293.95] + - [827, 4293.95] - - [448, 704, 1, 1280] - - [847, 4136.29] + - [850, 4136.29] - - [448, 6784, 1, 128] - - [825, 3976.1] + - [828, 3976.1] - - [1024, 448, 1, 3328] - - [857, 6376.23] + - [860, 6376.23] - - [2944, 128, 1, 256] - - [842, 4466.16] + - [845, 4466.16] - - [5056, 3584, 1, 128] - - [830, 4997.08] + - [833, 4997.08] - - [5888, 5888, 1, 3328] - - [860, 8870.27] + - [863, 8870.27] - - [6784, 1024, 1, 256] - - [839, 8520.43] + - [842, 8520.43] - - [2944, 2368, 1, 256] - - [876, 6174.49] + - [879, 6174.49] - - [256, 448, 1, 256] - - [852, 1844.23] + - [855, 1844.23] - - [5056, 5888, 1, 3328] - - [841, 8076.55] + - [844, 8076.55] - - [1856, 1024, 1, 256] - - [852, 7188.82] + - [855, 7188.82] - - [512, 48000, 1, 1536] - - [863, 7282.1] + - [866, 7282.1] - - [3584, 448, 1, 1280] - - [842, 6869.0] + - [845, 6869.0] - - [1024, 1024, 1, 1280] - - [852, 8027.35] + - [855, 8027.35] - - [448, 5888, 1, 256] - - [842, 5765.74] + - [845, 5765.74] - - [2048, 128, 1, 2048] - - [800, 4834.91] + - [803, 4834.91] - - [1408, 6784, 1, 3328] - - [852, 8613.66] + - [855, 8613.66] - - [448, 1024, 1, 128] - - [823, 2315.47] + - [826, 2315.47] - - [4288, 704, 1, 128] - - [824, 4138.82] + - [827, 4138.82] - - [128, 1856, 1, 128] - - [759, 1397.46] + - [762, 1397.46] - - [448, 2368, 1, 3328] - - [842, 6786.38] + - [845, 6786.38] - - [5056, 64, 1, 128] - - [824, 1664.74] + - [827, 1664.74] - - [5056, 2944, 1, 256] - - [875, 7697.39] + - [878, 7697.39] - - [6784, 5888, 1, 128] - - [824, 5003.57] + - [827, 5003.57] - - [1024, 700, 1, 512] - - [852, 6036.21] + - [855, 6036.21] - - [3072, 1, 1, 128] - - [806, 70.2171] + - [809, 70.2171] - - [1024, 4, 1, 256] - - [764, 154.202] + - [767, 154.202] - - [2944, 704, 1, 128] - - [830, 3696.9] + - [833, 3696.9] - - [128, 6784, 1, 1280] - - [842, 6731.41] + - [845, 6731.41] - - [1408, 3584, 1, 3328] - - [840, 9257.97] + - [843, 9257.97] - - [2368, 6784, 1, 256] - - [839, 8840.3] + - [842, 8840.3] - - [5056, 1408, 1, 1280] - - [840, 9240.74] + - [843, 9240.74] - - [5056, 4288, 1, 128] - - [835, 4309.08] + - [838, 4309.08] - - [4, 704, 1, 256] - - [790, 130.597] + - [793, 130.597] - - [4288, 2368, 1, 3328] - - [853, 8755.23] + - [856, 8755.23] - - [1408, 1856, 1, 128] - - [823, 3918.65] + - [826, 3918.65] - - [1408, 5888, 1, 3328] - - [860, 8910.37] + - [863, 8910.37] - - [1856, 256, 1, 256] - - [842, 5631.24] + - [845, 5631.24] - - [6784, 6784, 1, 256] - - [850, 9298.66] + - [853, 9298.66] - - [5888, 5056, 1, 128] - - [825, 4811.26] + - [828, 4811.26] - - [4288, 2368, 1, 128] - - [824, 4749.0] + - [827, 4749.0] - - [128, 5888, 1, 1280] - - [851, 6393.76] + - [854, 6393.76] - - [256, 4288, 1, 1280] - - [842, 6887.69] + - [845, 6887.69] - - [2368, 2944, 1, 256] - - [856, 8314.72] + - [859, 8314.72] - - [4, 1856, 1, 256] - - [879, 266.93] + - [882, 266.93] - - [3584, 1856, 1, 1280] - - [840, 8631.81] + - [843, 8631.81] - - [6784, 6784, 1, 128] - - [830, 5059.86] + - [833, 5059.86] - - [256, 1856, 1, 128] - - [823, 1858.72] + - [826, 1858.72] - - [704, 64, 1, 1280] - - [773, 2849.39] + - [776, 2849.39] - - [5888, 5056, 1, 256] - - [859, 8202.42] + - [862, 8202.42] - - [8448, 48000, 1, 2816] - - [850, 4281.84] + - [853, 4281.84] - - [512, 6000, 1, 2048] - - [842, 8047.79] + - [845, 8047.79] - - [3584, 448, 1, 256] - - [852, 6805.33] + - [855, 6805.33] - - [448, 4288, 1, 128] - - [830, 3500.73] + - [833, 3500.73] - - [7680, 64, 1, 2560] - - [785, 5957.8] + - [788, 5957.8] - - [256, 6784, 1, 256] - - [852, 7331.73] + - [855, 7331.73] - - [1408, 4288, 1, 128] - - [824, 4501.39] + - [827, 4501.39] - - [2944, 704, 1, 3328] - - [852, 8439.6] + - [855, 8439.6] - - [128, 448, 1, 256] - - [773, 1555.09] + - [776, 1555.09] - - [2048, 32, 1, 2048] - - [784, 3226.39] + - [787, 3226.39] - - [3584, 3584, 1, 256] - - [856, 8784.8] + - [859, 8784.8] - - [448, 1408, 1, 128] - - [823, 2535.82] + - [826, 2535.82] - - [128, 256, 1, 1280] - - [799, 2896.62] + - [802, 2896.62] - - [3584, 5056, 1, 256] - - [843, 8566.42] + - [846, 8566.42] - - [6784, 128, 1, 256] - - [842, 6053.87] + - [845, 6053.87] - - [4288, 4, 1, 256] - - [762, 428.8] + - [765, 428.8] - - [64, 1408, 1, 3328] - - [767, 5025.01] + - [770, 5025.01] - - [704, 448, 1, 256] - - [866, 3409.64] + - [869, 3409.64] - - [2944, 2368, 1, 1280] - - [840, 9066.25] + - [843, 9066.25] - - [448, 64, 1, 3328] - - [815, 3528.86] + - [818, 3528.86] - - [704, 6784, 1, 128] - - [829, 4212.51] + - [832, 4212.51] - - [3584, 4, 1, 3328] - - [882, 658.253] + - [885, 658.253] - - [6784, 3584, 1, 256] - - [850, 9061.74] + - [853, 9061.74] - - [704, 448, 1, 128] - - [829, 1552.7] + - [832, 1552.7] - - [256, 128, 1, 128] - - [754, 281.875] + - [757, 281.875] - - [704, 1408, 1, 128] - - [829, 3026.66] + - [832, 3026.66] - - [4, 448, 1, 128] - - [878, 5.46127] + - [881, 5.46127] - - [4288, 128, 1, 1280] - - [809, 5471.54] + - [812, 5471.54] - - [128, 1408, 1, 256] - - [852, 2813.25] + - [855, 2813.25] - - [4, 2944, 1, 256] - - [772, 316.666] + - [775, 316.666] - - [64, 128, 1, 3328] - - [820, 1872.46] + - [823, 1872.46] - - [1856, 1408, 1, 256] - - [842, 7735.79] + - [845, 7735.79] - - [5056, 2368, 1, 128] - - [824, 4830.09] + - [827, 4830.09] - - [2944, 2944, 1, 3328] - - [860, 8890.01] + - [863, 8890.01] - - [5056, 6784, 1, 256] - - [850, 9015.15] + - [853, 9015.15] - - [1856, 3584, 1, 128] - - [831, 4455.02] + - [834, 4455.02] - - [5888, 4, 1, 1280] - - [880, 641.963] + - [883, 641.963] - - [128, 2944, 1, 128] - - [749, 2036.93] + - [752, 2036.93] - - [35, 8457, 1, 2560] - - [737, 3988.13] + - [740, 3988.13] - - [3584, 6784, 1, 128] - - [824, 4774.44] + - [827, 4774.44] - - [128, 4288, 1, 256] - - [842, 4851.75] + - [845, 4851.75] - - [704, 448, 1, 3328] - - [857, 4432.53] + - [860, 4432.53] - - [2368, 6784, 1, 1280] - - [840, 9161.38] + - [843, 9161.38] - - [128, 128, 1, 3328] - - [814, 2839.89] + - [817, 2839.89] - - [5056, 1856, 1, 256] - - [856, 8380.84] + - [859, 8380.84] - - [256, 128, 1, 256] - - [798, 1165.08] + - [801, 1165.08] - - [1024, 3000, 1, 2816] - - [857, 8714.17] + - [860, 8714.17] - - [1024, 1856, 1, 256] - - [847, 7014.69] + - [850, 7014.69] - - [64, 1, 1, 1216] - - [820, 11.7205] + - [823, 11.7205] - - [4288, 64, 1, 128] - - [751, 1669.55] + - [754, 1669.55] - - [256, 448, 1, 3328] - - [775, 5152.29] + - [778, 5152.29] - - [1408, 6784, 1, 1280] - - [860, 8735.12] + - [863, 8735.12] - - [3584, 3584, 1, 1280] - - [857, 9019.99] + - [860, 9019.99] - - [7680, 24000, 1, 2560] - - [860, 6940.14] + - [863, 6940.14] - - [64, 2368, 1, 1280] - - [770, 4432.97] + - [773, 4432.97] - - [448, 2368, 1, 1280] - - [845, 5352.82] + - [848, 5352.82] - - [4608, 48000, 1, 1536] - - [839, 8129.01] + - [842, 8129.01] - - [5888, 5888, 1, 128] - - [832, 4700.81] + - [835, 4700.81] - - [64, 6784, 1, 3328] - - [842, 6170.72] + - [845, 6170.72] - - [2944, 256, 1, 1280] - - [872, 6177.55] + - [875, 6177.55] - - [2048, 16, 1, 2048] - - [794, 2167.6] + - [797, 2167.6] - - [256, 2368, 1, 128] - - [823, 2037.67] + - [826, 2037.67] - - [5056, 2368, 1, 3328] - - [840, 9040.5] + - [843, 9040.5] - - [2944, 4288, 1, 256] - - [871, 7552.12] + - [874, 7552.12] - - [1408, 3584, 1, 1280] - - [847, 8808.66] + - [850, 8808.66] - - [2368, 64, 1, 256] - - [783, 2320.41] + - [786, 2320.41] - - [1024, 128, 1, 128] - - [743, 1075.46] + - [746, 1075.46] - - [704, 128, 1, 3328] - - [776, 4984.92] + - [779, 4984.92] - - [5888, 4, 1, 128] - - [877, 33.5558] + - [880, 33.5558] - - [1856, 704, 1, 256] - - [852, 7110.88] + - [855, 7110.88] - - [1024, 1500, 1, 2816] - - [847, 8499.78] + - [850, 8499.78] - - [8448, 1, 1, 2816] - - [730, 251.369] + - [733, 251.369] - - [1024, 4, 1, 3328] - - [886, 540.932] + - [889, 540.932] - - [1024, 6000, 1, 2048] - - [847, 8698.49] + - [850, 8698.49] - - [512, 24000, 1, 2560] - - [840, 8963.6] + - [843, 8963.6] - - [6144, 3000, 1, 2560] - - [863, 8761.87] + - [866, 8761.87] - - [2368, 6784, 1, 3328] - - [857, 8867.39] + - [860, 8867.39] - - [1856, 1408, 1, 1280] - - [844, 7908.43] + - [847, 7908.43] - - [1856, 448, 1, 1280] - - [857, 6543.91] + - [860, 6543.91] - - [6784, 704, 1, 128] - - [823, 4086.35] + - [826, 4086.35] - - [4, 4, 1, 256] - - [790, 0.752941] + - [793, 0.752941] - - [128, 5888, 1, 128] - - [747, 2582.15] + - [750, 2582.15] - - [5056, 2944, 1, 128] - - [827, 4579.07] + - [830, 4579.07] - - [1408, 5888, 1, 256] - - [839, 8810.67] + - [842, 8810.67] - - [704, 2944, 1, 1280] - - [854, 8420.8] + - [857, 8420.8] - - [4288, 64, 1, 1280] - - [779, 4906.05] + - [782, 4906.05] - - [256, 64, 1, 256] - - [781, 689.853] + - [784, 689.853] - - [1024, 1024, 1, 256] - - [857, 5527.91] + - [860, 5527.91] - - [704, 1856, 1, 256] - - [841, 4452.82] + - [844, 4452.82] - - [2560, 64, 1, 2560] - - [770, 4562.99] + - [773, 4562.99] - - [3584, 704, 1, 1280] - - [847, 7898.67] + - [850, 7898.67] - - [256, 128, 1, 1280] - - [799, 2864.96] + - [802, 2864.96] - - [5888, 2368, 1, 256] - - [846, 8628.27] + - [849, 8628.27] - - [256, 2368, 1, 1280] - - [842, 6073.47] + - [845, 6073.47] - - [2944, 6784, 1, 128] - - [823, 4756.67] + - [826, 4756.67] - - [3584, 448, 1, 3328] - - [842, 7264.97] + - [845, 7264.97] - - [1408, 4, 1, 256] - - [883, 234.057] + - [886, 234.057] - - [704, 2368, 1, 3328] - - [840, 7248.88] + - [843, 7248.88] - - [2944, 448, 1, 256] - - [847, 6365.79] + - [850, 6365.79] - - [1856, 448, 1, 128] - - [825, 2976.24] + - [828, 2976.24] - - [4608, 6000, 1, 1536] - - [860, 9469.32] + - [863, 9469.32] - - [2368, 128, 1, 1280] - - [809, 4773.29] + - [812, 4773.29] - - [256, 5888, 1, 128] - - [824, 3111.9] + - [827, 3111.9] - - [64, 6784, 1, 256] - - [842, 3755.04] + - [845, 3755.04] - - [64, 5056, 1, 1280] - - [803, 4935.5] + - [806, 4935.5] - - [4, 6784, 1, 128] - - [878, 111.042] + - [881, 111.042] - - [3025, 64, 64, 64] - - [892, 6643.65] + - [895, 6643.65] - - [2944, 2944, 1, 1280] - - [840, 8869.45] + - [843, 8869.45] - - [5056, 448, 1, 3328] - - [873, 6706.1] + - [876, 6706.1] - - [4, 3584, 1, 1280] - - [790, 573.44] + - [793, 573.44] - - [1408, 128, 1, 128] - - [742, 1293.09] + - [745, 1293.09] - - [6784, 704, 1, 3328] - - [857, 8368.23] + - [860, 8368.23] - - [128, 64, 1, 1280] - - [816, 1260.31] + - [819, 1260.31] - - [2368, 256, 1, 1280] - - [842, 6154.37] + - [845, 6154.37] - - [4, 448, 1, 3328] - - [795, 351.638] + - [798, 351.638] - - [5888, 4288, 1, 128] - - [824, 4340.89] + - [827, 4340.89] - - [4, 5888, 1, 256] - - [772, 428.218] + - [775, 428.218] - - [1408, 2944, 1, 3328] - - [839, 9400.75] + - [842, 9400.75] - - [3584, 704, 1, 128] - - [826, 3392.45] + - [829, 3392.45] - - [64, 1024, 1, 256] - - [773, 1762.31] + - [776, 1762.31] - - [2368, 448, 1, 1280] - - [866, 5972.48] + - [869, 5972.48] - - [128, 3584, 1, 256] - - [842, 5224.22] + - [845, 5224.22] - - [704, 448, 1, 1280] - - [842, 4566.76] + - [845, 4566.76] - - [448, 5056, 1, 128] - - [824, 3876.09] + - [827, 3876.09] - - [6144, 4, 1, 2560] - - [766, 948.651] + - [769, 948.651] - - [5056, 3584, 1, 256] - - [856, 8162.46] + - [859, 8162.46] - - [4288, 4288, 1, 256] - - [863, 7653.24] + - [866, 7653.24] - - [1408, 5056, 1, 128] - - [830, 4554.24] + - [833, 4554.24] - - [2944, 3584, 1, 128] - - [836, 4146.9] + - [839, 4146.9] - - [3584, 2368, 1, 256] - - [857, 8194.95] + - [860, 8194.95] - - [5888, 5056, 1, 1280] - - [856, 9413.33] + - [859, 9413.33] - - [128, 1024, 1, 1280] - - [809, 4433.73] + - [812, 4433.73] - - [8448, 24000, 1, 2816] - - [850, 5227.02] + - [853, 5227.02] - - [64, 704, 1, 256] - - [773, 1441.79] + - [776, 1441.79] - - [4288, 256, 1, 1280] - - [872, 5687.7] + - [875, 5687.7] - - [3584, 3584, 1, 3328] - - [847, 9183.53] + - [850, 9183.53] - - [704, 64, 1, 128] - - [751, 402.735] + - [754, 402.735] - - [3072, 1500, 1, 128] - - [846, 7394.98] + - [849, 7394.98] - - [2048, 3136, 1, 512] - - [888, 8447.2] + - [891, 8447.2] - - [3025, 256, 64, 64] - - [896, 8063.69] + - [899, 8063.69] - - [5888, 6784, 1, 256] - - [840, 9281.91] + - [843, 9281.91] - - [4288, 2944, 1, 3328] - - [840, 9153.77] + - [843, 9153.77] - - [2944, 64, 1, 128] - - [757, 1463.43] + - [760, 1463.43] - - [1024, 128, 1, 3328] - - [807, 5377.31] + - [810, 5377.31] - - [1024, 16, 1, 500000] - - [723, 3997.03] + - [726, 3997.03] - - [4288, 128, 1, 3328] - - [811, 6053.21] + - [814, 6053.21] - - [7680, 128, 1, 2560] - - [857, 7769.14] + - [860, 7769.14] - - [256, 5056, 1, 1280] - - [866, 7200.74] + - [869, 7200.74] - - [1408, 256, 1, 128] - - [834, 1671.64] + - [837, 1671.64] - - [2944, 5888, 1, 3328] - - [846, 8642.08] + - [849, 8642.08] - - [6784, 5888, 1, 1280] - - [860, 8871.05] + - [863, 8871.05] - - [3072, 1, 1, 1024] - - [806, 205.872] + - [809, 205.872] - - [704, 128, 1, 256] - - [769, 1935.29] + - [772, 1935.29] - - [5888, 4288, 1, 1280] - - [847, 9176.6] + - [850, 9176.6] - - [1024, 24000, 1, 2048] - - [846, 8667.69] + - [849, 8667.69] - - [448, 256, 1, 1280] - - [779, 4327.85] + - [782, 4327.85] - - [5888, 3584, 1, 128] - - [824, 4669.35] + - [827, 4669.35] - - [64, 4288, 1, 3328] - - [804, 5374.94] + - [807, 5374.94] - - [448, 4, 1, 1280] - - [795, 289.616] + - [798, 289.616] - - [6784, 6784, 1, 3328] - - [853, 8306.63] + - [856, 8306.63] - - [5056, 4, 1, 1280] - - [765, 607.099] + - [768, 607.099] - - [4, 5888, 1, 3328] - - [790, 651.438] + - [793, 651.438] - - [256, 1408, 1, 1280] - - [842, 5176.99] + - [845, 5176.99] - - [3072, 16, 1, 1024] - - [801, 2207.53] + - [804, 2207.53] - - [704, 3584, 1, 128] - - [834, 3653.41] + - [837, 3653.41] - - [1024, 2, 1, 512] - - [821, 156.038] + - [824, 156.038] - - [5888, 448, 1, 3328] - - [842, 7896.75] + - [845, 7896.75] - - [2368, 4288, 1, 1280] - - [839, 8517.53] + - [842, 8517.53] - - [4288, 2944, 1, 128] - - [828, 4439.16] + - [831, 4439.16] - - [256, 64, 1, 3328] - - [814, 2704.66] + - [817, 2704.66] - - [2944, 64, 1, 3328] - - [779, 5647.05] + - [782, 5647.05] - - [6784, 64, 1, 3328] - - [852, 6434.51] + - [855, 6434.51] - - [5056, 2944, 1, 3328] - - [863, 8497.1] + - [866, 8497.1] - - [448, 128, 1, 256] - - [781, 1516.54] + - [784, 1516.54] - - [2944, 3584, 1, 256] - - [857, 8365.73] + - [860, 8365.73] - - [1408, 1408, 1, 3328] - - [840, 8440.32] + - [843, 8440.32] - - [1856, 128, 1, 1280] - - [842, 5242.83] + - [845, 5242.83] - - [3584, 3584, 1, 128] - - [824, 4385.84] + - [827, 4385.84] - - [64, 3584, 1, 256] - - [842, 3276.8] + - [845, 3276.8] - - [1408, 4, 1, 3328] - - [765, 605.404] + - [768, 605.404] - - [128, 2944, 1, 3328] - - [810, 6295.65] + - [813, 6295.65] - - [3584, 704, 1, 256] - - [847, 7711.54] + - [850, 7711.54] - - [2944, 448, 1, 3328] - - [858, 6503.87] + - [861, 6503.87] - - [1024, 2, 1, 500000] - - [727, 521.703] + - [730, 521.703] - - [3584, 1408, 1, 3328] - - [849, 8296.1] + - [852, 8296.1] - - [704, 3584, 1, 1280] - - [854, 7670.55] + - [857, 7670.55] - - [1024, 1408, 1, 128] - - [829, 2830.51] + - [832, 2830.51] - - [1856, 6784, 1, 256] - - [860, 8149.57] + - [863, 8149.57] - - [4288, 448, 1, 3328] - - [841, 7406.34] + - [844, 7406.34] - - [6784, 4288, 1, 128] - - [836, 4417.99] + - [839, 4417.99] - - [6784, 704, 1, 1280] - - [857, 8302.35] + - [860, 8302.35] - - [6144, 1, 1, 2560] - - [766, 243.327] + - [769, 243.327] - - [3584, 6784, 1, 256] - - [839, 9036.49] + - [842, 9036.49] - - [6144, 16, 1, 2560] - - [773, 3266.59] + - [776, 3266.59] - - [3584, 64, 1, 128] - - [757, 1555.09] + - [760, 1555.09] - - [5888, 1024, 1, 3328] - - [847, 8887.98] + - [850, 8887.98] - - [448, 64, 1, 128] - - [743, 247.974] + - [746, 247.974] - - [704, 6784, 1, 1280] - - [843, 7892.46] + - [846, 7892.46] - - [4, 448, 1, 256] - - [765, 70.7951] + - [768, 70.7951] - - [5888, 128, 1, 256] - - [841, 5714.99] + - [844, 5714.99] - - [4096, 16, 1, 4096] - - [787, 3251.4] + - [790, 3251.4] - - [1856, 5056, 1, 3328] - - [856, 8740.17] + - [859, 8740.17] - - [4, 6784, 1, 256] - - [879, 360.312] + - [882, 360.312] - - [1024, 3584, 1, 128] - - [824, 3456.17] + - [827, 3456.17] - - [64, 704, 1, 3328] - - [792, 3817.37] + - [795, 3817.37] - - [2368, 2944, 1, 128] - - [830, 4605.37] + - [833, 4605.37] - - [5056, 64, 1, 256] - - [842, 3863.69] + - [845, 3863.69] - - [512, 1500, 1, 1536] - - [842, 6801.46] + - [845, 6801.46] - - [512, 1, 1, 500000] - - [731, 260.968] + - [734, 260.968] - - [5888, 2944, 1, 3328] - - [846, 8501.78] + - [849, 8501.78] - - [128, 3584, 1, 1280] - - [847, 5938.54] + - [850, 5938.54] - - [1024, 704, 1, 128] - - [833, 2172.19] + - [836, 2172.19] - - [1408, 2368, 1, 128] - - [829, 4023.1] + - [832, 4023.1] - - [5888, 2368, 1, 128] - - [830, 4424.52] + - [833, 4424.52] - - [128, 5056, 1, 3328] - - [842, 6692.06] + - [845, 6692.06] - - [3584, 6784, 1, 1280] - - [840, 9488.54] + - [843, 9488.54] - - [4288, 1856, 1, 256] - - [850, 8287.42] + - [853, 8287.42] - - [1856, 5888, 1, 256] - - [861, 7707.73] + - [864, 7707.73] - - [256, 256, 1, 256] - - [808, 1613.19] + - [811, 1613.19] - - [4288, 4288, 1, 3328] - - [850, 8923.49] + - [853, 8923.49] - - [1024, 1024, 1, 128] - - [830, 2553.61] + - [833, 2553.61] - - [4288, 1408, 1, 1280] - - [850, 8930.37] + - [853, 8930.37] - - [3584, 5056, 1, 128] - - [834, 4495.05] + - [837, 4495.05] - - [4, 1024, 1, 3328] - - [790, 415.594] + - [793, 415.594] - - [4, 704, 1, 128] - - [878, 13.8634] + - [881, 13.8634] - - [4288, 2368, 1, 256] - - [875, 7134.98] + - [878, 7134.98] - - [2944, 5056, 1, 1280] - - [847, 9118.51] + - [850, 9118.51] - - [448, 6784, 1, 256] - - [871, 5430.21] + - [874, 5430.21] - - [64, 128, 1, 128] - - [754, 82.957] + - [757, 82.957] - - [1856, 2368, 1, 128] - - [830, 4422.65] + - [833, 4422.65] - - [6784, 2368, 1, 3328] - - [843, 8769.3] + - [846, 8769.3] - - [1408, 6784, 1, 128] - - [830, 4738.9] + - [833, 4738.9] - - [256, 1024, 1, 1280] - - [852, 5722.11] + - [855, 5722.11] - - [704, 4, 1, 128] - - [878, 8.56578] + - [881, 8.56578] - - [1408, 4, 1, 128] - - [878, 26.0439] + - [881, 26.0439] - - [4288, 128, 1, 256] - - [852, 4865.28] + - [855, 4865.28] - - [4288, 1856, 1, 3328] - - [839, 9249.94] + - [842, 9249.94] - - [3584, 448, 1, 128] - - [830, 3029.49] + - [833, 3029.49] - - [64, 4288, 1, 128] - - [747, 1535.28] + - [750, 1535.28] - - [64, 448, 1, 3328] - - [817, 3457.26] + - [820, 3457.26] - - [448, 4, 1, 3328] - - [795, 367.228] + - [798, 367.228] - - [256, 4, 1, 3328] - - [886, 320.289] + - [889, 320.289] - - [4, 1408, 1, 1280] - - [883, 343.939] + - [886, 343.939] - - [3584, 64, 1, 1280] - - [771, 5190.97] + - [774, 5190.97] - - [1408, 448, 1, 128] - - [831, 2218.14] + - [834, 2218.14] - - [3584, 1024, 1, 1280] - - [853, 8253.01] + - [856, 8253.01] - - [1856, 5056, 1, 256] - - [871, 7552.45] + - [874, 7552.45] - - [4, 3584, 1, 256] - - [790, 325.356] + - [793, 325.356] - - [6784, 4288, 1, 3328] - - [846, 8655.24] + - [849, 8655.24] - - [4, 2944, 1, 1280] - - [790, 547.721] + - [793, 547.721] - - [1024, 4288, 1, 256] - - [847, 7788.73] + - [850, 7788.73] - - [5888, 3584, 1, 3328] - - [850, 9173.29] + - [853, 9173.29] - - [1856, 4, 1, 256] - - [881, 282.819] + - [884, 282.819] - - [4, 256, 1, 256] - - [790, 49.6485] + - [793, 49.6485] - - [5056, 3584, 1, 3328] - - [856, 8457.43] + - [859, 8457.43] - - [1408, 128, 1, 3328] - - [810, 5714.42] + - [813, 5714.42] - - [4, 64, 1, 1280] - - [886, 42.6667] + - [889, 42.6667] - - [2368, 1408, 1, 1280] - - [847, 8224.82] + - [850, 8224.82] - - [5056, 2944, 1, 1280] - - [839, 9295.03] + - [842, 9295.03] - - [8448, 6000, 1, 2816] - - [843, 8037.87] + - [846, 8037.87] - - [4, 4, 1, 128] - - [878, 0.0433898] + - [881, 0.0433898] - - [3584, 256, 1, 256] - - [842, 6116.69] + - [845, 6116.69] - - [3584, 2944, 1, 1280] - - [839, 8796.39] + - [842, 8796.39] - - [1024, 6784, 1, 256] - - [846, 8187.76] + - [849, 8187.76] - - [4, 128, 1, 256] - - [790, 30.3407] + - [793, 30.3407] - - [6784, 448, 1, 256] - - [842, 7862.2] + - [845, 7862.2] - - [5124, 9124, 1, 2048] - - [844, 8176.31] + - [847, 8176.31] - - [2944, 5056, 1, 3328] - - [839, 9328.24] + - [842, 9328.24] - - [6784, 4, 1, 128] - - [877, 204.8] + - [880, 204.8] - - [2944, 1408, 1, 128] - - [828, 3838.1] + - [831, 3838.1] - - [448, 128, 1, 3328] - - [793, 4632.06] + - [796, 4632.06] - - [64, 2944, 1, 3328] - - [810, 5663.37] + - [813, 5663.37] - - [5056, 6784, 1, 3328] - - [846, 8420.07] + - [849, 8420.07] - - [704, 2368, 1, 128] - - [830, 3321.69] + - [833, 3321.69] - - [3072, 1500, 1, 1024] - - [847, 8221.67] + - [850, 8221.67] - - [128, 2944, 1, 256] - - [842, 4550.42] + - [845, 4550.42] - - [128, 6784, 1, 128] - - [747, 2767.66] + - [750, 2767.66] - - [3584, 4288, 1, 256] - - [846, 8808.54] + - [849, 8808.54] - - [448, 1856, 1, 256] - - [851, 5166.53] + - [854, 5166.53] - - [1856, 6784, 1, 3328] - - [843, 8339.66] + - [846, 8339.66] - - [3584, 128, 1, 3328] - - [852, 6791.47] + - [855, 6791.47] - - [64, 1856, 1, 256] - - [774, 2209.93] + - [777, 2209.93] - - [64, 448, 1, 256] - - [806, 1008.25] + - [809, 1008.25] - - [5888, 4288, 1, 256] - - [846, 8869.53] + - [849, 8869.53] - - [128, 1500, 1, 1280] - - [803, 4733.44] + - [806, 4733.44] - - [5056, 1408, 1, 256] - - [844, 7523.21] + - [847, 7523.21] - - [35, 8457, 1, 4096] - - [737, 4023.07] + - [740, 4023.07] - - [64, 256, 1, 1280] - - [798, 1941.81] + - [801, 1941.81] - - [2944, 4, 1, 128] - - [877, 95.6426] + - [880, 95.6426] - - [3584, 1024, 1, 256] - - [869, 6553.58] + - [872, 6553.58] - - [512, 6000, 1, 1536] - - [843, 7357.15] + - [846, 7357.15] - - [256, 704, 1, 256] - - [842, 2912.71] + - [845, 2912.71] - - [5888, 5888, 1, 256] - - [853, 8802.6] + - [856, 8802.6] - - [4288, 1024, 1, 1280] - - [846, 8248.73] + - [849, 8248.73] - - [5888, 128, 1, 3328] - - [796, 6848.49] + - [799, 6848.49] - - [448, 6784, 1, 3328] - - [842, 8343.68] + - [845, 8343.68] - - [2944, 1408, 1, 1280] - - [839, 9229.38] + - [842, 9229.38] - - [3072, 6000, 1, 1024] - - [860, 9014.91] + - [863, 9014.91] - - [1024, 32, 1, 512] - - [781, 1497.97] + - [784, 1497.97] - - [2944, 1856, 1, 3328] - - [856, 7176.38] + - [859, 7176.38] - - [2368, 64, 1, 128] - - [747, 1206.38] + - [750, 1206.38] - - [256, 1024, 1, 128] - - [824, 1178.18] + - [827, 1178.18] - - [3584, 5888, 1, 1280] - - [846, 9023.48] + - [849, 9023.48] - - [64, 4, 1, 128] - - [878, 0.989372] + - [881, 0.989372] - - [6784, 1856, 1, 1280] - - [840, 8964.41] + - [843, 8964.41] - - [2944, 5056, 1, 256] - - [846, 8860.02] + - [849, 8860.02] - - [5888, 256, 1, 3328] - - [857, 8308.56] + - [860, 8308.56] - - [2944, 4288, 1, 128] - - [825, 4507.51] + - [828, 4507.51] - - [3584, 1408, 1, 256] - - [840, 8234.61] + - [843, 8234.61] - - [704, 3584, 1, 3328] - - [852, 7377.16] + - [855, 7377.16] - - [5056, 448, 1, 1280] - - [841, 7145.37] + - [844, 7145.37] - - [3584, 1856, 1, 3328] - - [857, 8954.71] + - [860, 8954.71] - - [64, 1408, 1, 128] - - [754, 731.874] + - [757, 731.874] - - [4288, 6784, 1, 1280] - - [846, 9166.45] + - [849, 9166.45] - - [1024, 3000, 1, 2048] - - [857, 7723.73] + - [860, 7723.73] - - [1408, 704, 1, 1280] - - [847, 7863.0] + - [850, 7863.0] - - [2944, 1024, 1, 256] - - [840, 5034.92] + - [843, 5034.92] - - [256, 64, 1, 128] - - [746, 150.657] + - [749, 150.657] - - [2368, 4288, 1, 3328] - - [844, 8568.74] + - [847, 8568.74] - - [4, 1408, 1, 256] - - [790, 219.785] + - [793, 219.785] - - [1024, 1408, 1, 1280] - - [872, 6761.03] + - [875, 6761.03] - - [64, 64, 1, 256] - - [772, 198.594] + - [775, 198.594] - - [704, 256, 1, 3328] - - [842, 4291.52] + - [845, 4291.52] - - [6784, 5056, 1, 256] - - [841, 8544.92] + - [844, 8544.92] - - [1856, 1856, 1, 128] - - [829, 4034.83] + - [832, 4034.83] - - [4288, 5888, 1, 256] - - [860, 8997.95] + - [863, 8997.95] - - [4, 704, 1, 3328] - - [795, 452.3] + - [798, 452.3] - - [35, 8457, 1, 2048] - - [738, 3375.27] + - [741, 3375.27] - - [448, 2944, 1, 256] - - [842, 6346.64] + - [845, 6346.64] - - [4, 4288, 1, 3328] - - [795, 630.878] + - [798, 630.878] - - [2944, 6784, 1, 256] - - [869, 8002.82] + - [872, 8002.82] - - [2944, 2944, 1, 128] - - [824, 4661.31] + - [827, 4661.31] - - [4, 4, 1, 1280] - - [795, 3.04762] + - [798, 3.04762] - - [1856, 3584, 1, 1280] - - [839, 8677.56] + - [842, 8677.56] - - [64, 2944, 1, 256] - - [842, 2926.85] + - [845, 2926.85] - - [3584, 1408, 1, 1280] - - [853, 8238.8] + - [856, 8238.8] - - [448, 256, 1, 128] - - [754, 1042.62] + - [757, 1042.62] - - [4288, 448, 1, 128] - - [830, 3698.72] + - [833, 3698.72] - - [5056, 256, 1, 1280] - - [847, 7058.4] + - [850, 7058.4] - - [1856, 1408, 1, 3328] - - [844, 8348.25] + - [847, 8348.25] - - [128, 128, 1, 128] - - [754, 145.636] + - [757, 145.636] - - [1024, 4288, 1, 3328] - - [840, 8042.51] + - [843, 8042.51] - - [448, 2368, 1, 256] - - [852, 5934.9] + - [855, 5934.9] - - [1024, 4, 1, 128] - - [878, 15.83] + - [881, 15.83] - - [64, 1408, 1, 1280] - - [776, 3865.39] + - [779, 3865.39] - - [64, 6784, 1, 1280] - - [872, 5629.51] + - [875, 5629.51] - - [5056, 448, 1, 256] - - [842, 7637.81] + - [845, 7637.81] - - [2944, 2368, 1, 3328] - - [850, 9112.34] + - [853, 9112.34] - - [704, 4288, 1, 3328] - - [842, 7950.1] + - [845, 7950.1] - - [1408, 128, 1, 256] - - [842, 2898.07] + - [845, 2898.07] - - [1024, 1856, 1, 1280] - - [840, 8087.41] + - [843, 8087.41] - - [6784, 1856, 1, 256] - - [871, 7538.15] + - [874, 7538.15] - - [512, 48000, 1, 2816] - - [839, 9704.11] + - [842, 9704.11] - - [512, 3000, 1, 2816] - - [841, 7621.53] + - [844, 7621.53] - - [128, 2368, 1, 3328] - - [804, 6038.84] + - [807, 6038.84] - - [1024, 5888, 1, 256] - - [856, 8185.72] + - [859, 8185.72] - - [64, 2944, 1, 1280] - - [803, 4540.14] + - [806, 4540.14] - - [6784, 1408, 1, 256] - - [856, 8573.9] + - [859, 8573.9] - - [5056, 64, 1, 3328] - - [804, 6310.87] + - [807, 6310.87] - - [128, 704, 1, 128] - - [743, 696.518] + - [746, 696.518] - - [1408, 2368, 1, 256] - - [842, 4994.96] + - [845, 4994.96] - - [1408, 1408, 1, 256] - - [839, 7552.24] + - [842, 7552.24] - - [4, 64, 1, 128] - - [877, 1.80441] + - [880, 1.80441] - - [64, 128, 1, 1280] - - [816, 1272.54] + - [819, 1272.54] - - [1024, 8, 1, 500000] - - [724, 2013.13] + - [727, 2013.13] - - [4, 2368, 1, 128] - - [878, 49.8526] + - [881, 49.8526] - - [2368, 2368, 1, 128] - - [829, 4483.7] + - [832, 4483.7] - - [64, 5888, 1, 128] - - [746, 1957.57] + - [749, 1957.57] - - [5888, 4, 1, 3328] - - [879, 638.698] + - [882, 638.698] - - [6784, 1408, 1, 128] - - [824, 4715.51] + - [827, 4715.51] - - [1408, 5056, 1, 256] - - [856, 8557.57] + - [859, 8557.57] - - [512, 50176, 1, 128] - - [887, 8809.29] + - [890, 8809.29] - - [5056, 128, 1, 3328] - - [779, 6810.56] + - [782, 6810.56] - - [128, 128, 1, 1280] - - [813, 1899.59] + - [816, 1899.59] - - [512, 2, 1, 512] - - [733, 87.3813] + - [736, 87.3813] - - [448, 704, 1, 256] - - [852, 3765.87] + - [855, 3765.87] - - [4288, 3584, 1, 128] - - [837, 4563.67] + - [840, 4563.67] - - [2944, 128, 1, 3328] - - [779, 6507.35] + - [782, 6507.35] - - [128, 5056, 1, 1280] - - [842, 6557.75] + - [845, 6557.75] - - [3584, 5056, 1, 1280] - - [839, 9407.83] + - [842, 9407.83] - - [256, 448, 1, 1280] - - [803, 4096.0] + - [806, 4096.0] - - [704, 704, 1, 128] - - [829, 2374.21] + - [832, 2374.21] - - [5056, 4, 1, 128] - - [877, 125.42] + - [880, 125.42] - - [704, 256, 1, 1280] - - [852, 4016.13] + - [855, 4016.13] - - [64, 2368, 1, 3328] - - [809, 5159.19] + - [812, 5159.19] - - [1856, 1024, 1, 128] - - [829, 3356.37] + - [832, 3356.37] - - [1856, 64, 1, 128] - - [746, 945.544] + - [749, 945.544] - - [4096, 64, 1, 4096] - - [812, 6260.14] + - [815, 6260.14] - - [1024, 24000, 1, 1536] - - [856, 9368.4] + - [859, 9368.4] - - [704, 4288, 1, 256] - - [853, 7329.29] + - [856, 7329.29] - - [5888, 2368, 1, 1280] - - [842, 8624.61] + - [845, 8624.61] - - [6784, 1856, 1, 3328] - - [846, 9012.35] + - [849, 9012.35] - - [64, 128, 1, 256] - - [772, 374.491] + - [775, 374.491] - - [2368, 5888, 1, 1280] - - [840, 9045.66] + - [843, 9045.66] - - [5888, 256, 1, 1280] - - [857, 7999.07] + - [860, 7999.07] - - [4, 5888, 1, 1280] - - [790, 615.739] + - [793, 615.739] - - [704, 128, 1, 128] - - [746, 693.169] + - [749, 693.169] - - [1024, 4, 1, 1280] - - [885, 372.364] + - [888, 372.364] - - [2368, 1856, 1, 3328] - - [857, 8246.81] + - [860, 8246.81] - - [2368, 128, 1, 128] - - [747, 1963.43] + - [750, 1963.43] - - [2944, 704, 1, 256] - - [857, 7116.14] + - [860, 7116.14] - - [5056, 128, 1, 128] - - [750, 2519.39] + - [753, 2519.39] - - [2368, 1024, 1, 3328] - - [842, 7959.03] + - [845, 7959.03] - - [35, 700, 1, 2048] - - [738, 1766.76] + - [741, 1766.76] - - [256, 704, 1, 3328] - - [842, 4296.46] + - [845, 4296.46] - - [704, 3584, 1, 256] - - [841, 7441.51] + - [844, 7441.51] - - [704, 2944, 1, 3328] - - [858, 7195.71] + - [861, 7195.71] - - [6784, 1024, 1, 128] - - [829, 4509.08] + - [832, 4509.08] - - [256, 448, 1, 128] - - [754, 837.903] + - [757, 837.903] - - [448, 1024, 1, 3328] - - [852, 6515.55] + - [855, 6515.55] - - [2944, 1024, 1, 3328] - - [847, 8751.53] + - [850, 8751.53] - - [2944, 5056, 1, 128] - - [824, 4799.63] + - [827, 4799.63] - - [2368, 256, 1, 256] - - [841, 4754.57] + - [844, 4754.57] - - [1408, 6784, 1, 256] - - [869, 7476.99] + - [872, 7476.99] - - [6784, 1408, 1, 3328] - - [847, 8968.47] + - [850, 8968.47] - - [4288, 6784, 1, 128] - - [822, 4455.64] + - [825, 4455.64] - - [1408, 2944, 1, 128] - - [834, 3862.69] + - [837, 3862.69] - - [704, 64, 1, 256] - - [773, 1441.79] + - [776, 1441.79] - - [3072, 4, 1, 1024] - - [791, 711.703] + - [794, 711.703] - - [256, 2368, 1, 3328] - - [866, 5199.63] + - [869, 5199.63] - - [6784, 2944, 1, 1280] - - [850, 8914.35] + - [853, 8914.35] - - [4288, 1856, 1, 128] - - [830, 4683.2] + - [833, 4683.2] - - [1856, 2944, 1, 128] - - [824, 4589.24] + - [827, 4589.24] - - [6784, 448, 1, 128] - - [824, 3918.43] + - [827, 3918.43] - - [64, 3584, 1, 128] - - [755, 1468.01] + - [758, 1468.01] - - [448, 5056, 1, 1280] - - [847, 7561.3] + - [850, 7561.3] - - [4288, 5056, 1, 1280] - - [839, 9304.01] + - [842, 9304.01] - - [2368, 1856, 1, 128] - - [829, 4322.07] + - [832, 4322.07] - - [128, 448, 1, 1280] - - [809, 3336.38] + - [812, 3336.38] - - [4288, 704, 1, 256] - - [852, 7834.55] + - [855, 7834.55] - - [256, 3584, 1, 128] - - [825, 2500.86] + - [828, 2500.86] - - [5888, 704, 1, 256] - - [871, 7244.39] + - [874, 7244.39] - - [3584, 1024, 1, 128] - - [836, 3168.93] + - [839, 3168.93] - - [256, 5888, 1, 3328] - - [857, 7763.37] + - [860, 7763.37] - - [1408, 4288, 1, 3328] - - [839, 9273.7] + - [842, 9273.7] - - [6784, 4288, 1, 256] - - [847, 8825.1] + - [850, 8825.1] - - [4288, 256, 1, 128] - - [826, 2621.44] + - [829, 2621.44] - - [448, 1856, 1, 3328] - - [867, 5859.7] + - [870, 5859.7] - - [5888, 256, 1, 256] - - [857, 7124.74] + - [860, 7124.74] - - [1024, 4, 1, 500000] - - [722, 1030.1] + - [725, 1030.1] - - [6784, 1024, 1, 1280] - - [839, 9083.01] + - [842, 9083.01] - - [5888, 1024, 1, 128] - - [826, 4297.06] + - [829, 4297.06] - - [1024, 128, 1, 256] - - [842, 2086.72] + - [845, 2086.72] - - [512, 16, 1, 500000] - - [723, 3921.86] + - [726, 3921.86] - - [128, 64, 1, 3328] - - [813, 1969.87] + - [816, 1969.87] - - [448, 64, 1, 256] - - [798, 1092.27] + - [801, 1092.27] - - [2368, 256, 1, 128] - - [829, 2174.74] + - [832, 2174.74] - - [6784, 3584, 1, 1280] - - [839, 9558.72] + - [842, 9558.72] - - [1024, 6784, 1, 1280] - - [848, 8637.62] + - [851, 8637.62] - - [2944, 64, 1, 1280] - - [770, 4770.03] + - [773, 4770.03] - - [1408, 2944, 1, 1280] - - [839, 9238.37] + - [842, 9238.37] - - [256, 1856, 1, 256] - - [865, 4498.33] + - [868, 4498.33] - - [1408, 2368, 1, 3328] - - [847, 8344.87] + - [850, 8344.87] - - [2944, 4, 1, 3328] - - [882, 661.109] + - [885, 661.109] - - [128, 1408, 1, 3328] - - [810, 5641.32] + - [813, 5641.32] - - [2944, 1856, 1, 128] - - [824, 4487.94] + - [827, 4487.94] - - [256, 2944, 1, 128] - - [834, 2233.08] + - [837, 2233.08] - - [256, 6784, 1, 128] - - [823, 3139.8] + - [826, 3139.8] - - [2368, 4, 1, 128] - - [878, 38.6612] + - [881, 38.6612] - - [1408, 256, 1, 3328] - - [874, 4927.57] + - [877, 4927.57] - - [1856, 4, 1, 128] - - [878, 42.2719] + - [881, 42.2719] - - [1024, 16, 1, 512] - - [790, 1115.51] + - [793, 1115.51] - - [5056, 6784, 1, 128] - - [825, 4963.35] + - [828, 4963.35] - - [4288, 5056, 1, 128] - - [823, 4927.99] + - [826, 4927.99] - - [1856, 5888, 1, 128] - - [830, 4865.05] + - [833, 4865.05] - - [7680, 2, 1, 2560] - - [766, 499.512] + - [769, 499.512] - - [3584, 1856, 1, 256] - - [856, 7978.28] + - [859, 7978.28] - - [4288, 3584, 1, 1280] - - [856, 7852.16] + - [859, 7852.16] - - [2368, 448, 1, 256] - - [871, 5238.83] + - [874, 5238.83] - - [4288, 256, 1, 3328] - - [842, 6751.24] + - [845, 6751.24] - - [1856, 704, 1, 128] - - [824, 3525.46] + - [827, 3525.46] - - [1408, 64, 1, 256] - - [783, 1884.7] + - [786, 1884.7] - - [64, 1856, 1, 128] - - [760, 888.105] + - [763, 888.105] - - [4, 256, 1, 128] - - [877, 7.28178] + - [880, 7.28178] - - [512, 16, 1, 512] - - [790, 663.656] + - [793, 663.656] - - [704, 5888, 1, 128] - - [824, 4424.45] + - [827, 4424.45] - - [6784, 3584, 1, 128] - - [826, 3823.3] + - [829, 3823.3] - - [1024, 64, 1, 256] - - [768, 1379.71] + - [771, 1379.71] - - [64, 2368, 1, 256] - - [842, 2424.83] + - [845, 2424.83] - - [5124, 1500, 1, 2048] - - [860, 8391.74] + - [863, 8391.74] - - [4288, 5056, 1, 3328] - - [846, 9274.04] + - [849, 9274.04] - - [4, 1856, 1, 1280] - - [790, 453.374] + - [793, 453.374] - - [4288, 128, 1, 128] - - [824, 2157.7] + - [827, 2157.7] - - [512, 2, 1, 500000] - - [734, 516.795] + - [737, 516.795] - - [1408, 1408, 1, 128] - - [825, 3600.39] + - [828, 3600.39] - - [7680, 16, 1, 2560] - - [805, 3542.49] + - [808, 3542.49] - - [1856, 128, 1, 128] - - [757, 1532.7] + - [760, 1532.7] - - [5056, 2368, 1, 256] - - [869, 7683.97] + - [872, 7683.97] - - [4288, 704, 1, 3328] - - [842, 7642.86] + - [845, 7642.86] - - [448, 3584, 1, 256] - - [852, 6733.97] + - [855, 6733.97] - - [2368, 64, 1, 1280] - - [803, 3962.14] + - [806, 3962.14] - - [2368, 1024, 1, 1280] - - [854, 7989.54] + - [857, 7989.54] - - [2944, 1408, 1, 3328] - - [857, 8954.56] + - [860, 8954.56] - - [6144, 1500, 1, 2560] - - [875, 8169.97] + - [878, 8169.97] - - [4224, 1, 1, 128] - - [806, 76.8] + - [809, 76.8] - - [1024, 1408, 1, 3328] - - [872, 6961.28] + - [875, 6961.28] - - [2944, 5888, 1, 1280] - - [853, 8797.43] + - [856, 8797.43] - - [8448, 2, 1, 2816] - - [728, 496.858] + - [731, 496.858] - - [1408, 4, 1, 1280] - - [883, 471.791] + - [886, 471.791] - - [5888, 3584, 1, 256] - - [860, 8246.2] + - [863, 8246.2] - - [2368, 5056, 1, 128] - - [823, 4906.8] + - [826, 4906.8] - - [1408, 1856, 1, 3328] - - [847, 9006.7] + - [850, 9006.7] - - [4, 4, 1, 3328] - - [795, 5.73793] + - [798, 5.73793] - - [5888, 5056, 1, 3328] - - [860, 8545.0] + - [863, 8545.0] - - [7680, 6000, 1, 2560] - - [853, 7995.9] + - [856, 7995.9] - - [6784, 1408, 1, 1280] - - [847, 8888.03] + - [850, 8888.03] - - [4, 1024, 1, 1280] - - [795, 302.009] + - [798, 302.009] - - [512, 3000, 1, 2560] - - [847, 7809.33] + - [850, 7809.33] - - [704, 2944, 1, 256] - - [852, 4909.14] + - [855, 4909.14] - - [4288, 64, 1, 256] - - [852, 3264.62] + - [855, 3264.62] - - [6784, 5888, 1, 3328] - - [860, 9544.42] + - [863, 9544.42] - - [2368, 4288, 1, 128] - - [823, 4872.93] + - [826, 4872.93] - - [64, 4288, 1, 1280] - - [809, 4656.32] + - [812, 4656.32] - - [6784, 64, 1, 1280] - - [842, 6230.33] + - [845, 6230.33] - - [3584, 128, 1, 128] - - [750, 2315.47] + - [753, 2315.47] - - [1024, 6784, 1, 128] - - [824, 3758.84] + - [827, 3758.84] - - [1024, 1500, 1, 1536] - - [873, 6971.9] + - [876, 6971.9] - - [1408, 64, 1, 3328] - - [776, 5079.48] + - [779, 5079.48] - - [6784, 4, 1, 256] - - [762, 487.838] + - [765, 487.838] - - [1408, 1408, 1, 1280] - - [875, 7423.21] + - [878, 7423.21] - - [256, 2368, 1, 256] - - [842, 4986.8] + - [845, 4986.8] - - [3072, 3000, 1, 1024] - - [844, 7843.91] + - [847, 7843.91] - - [448, 4288, 1, 3328] - - [843, 7204.69] + - [846, 7204.69] - - [2368, 1408, 1, 256] - - [875, 5897.86] + - [878, 5897.86] - - [704, 2368, 1, 256] - - [842, 7000.83] + - [845, 7000.83] - - [1024, 24000, 1, 2560] - - [869, 8562.21] + - [872, 8562.21] - - [2944, 448, 1, 1280] - - [857, 7155.83] + - [860, 7155.83] - - [5888, 2368, 1, 3328] - - [856, 9252.32] + - [859, 9252.32] - - [1024, 256, 1, 128] - - [838, 1255.78] + - [841, 1255.78] - - [5124, 9124, 1, 1760] - - [850, 9168.39] + - [853, 9168.39] - - [448, 1408, 1, 1280] - - [842, 6150.24] + - [845, 6150.24] - - [448, 1856, 1, 1280] - - [857, 6489.66] + - [860, 6489.66] - - [4288, 448, 1, 1280] - - [872, 6886.92] + - [875, 6886.92] - - [5888, 704, 1, 3328] - - [852, 8230.54] + - [855, 8230.54] - - [4, 1856, 1, 128] - - [878, 26.9964] + - [881, 26.9964] - - [5056, 256, 1, 128] - - [823, 3468.91] + - [826, 3468.91] - - [1856, 256, 1, 128] - - [824, 2534.06] + - [827, 2534.06] - - [128, 2368, 1, 256] - - [842, 3660.12] + - [845, 3660.12] - - [704, 4, 1, 256] - - [790, 134.496] + - [793, 134.496] - - [1024, 6784, 1, 3328] - - [844, 8482.65] + - [847, 8482.65] - - [1408, 5888, 1, 128] - - [824, 4644.42] + - [827, 4644.42] - - [4288, 4, 1, 128] - - [877, 35.7799] + - [880, 35.7799] - - [512, 3136, 1, 2048] - - [889, 6386.59] + - [892, 6386.59] - - [1408, 1024, 1, 256] - - [842, 5440.72] + - [845, 5440.72] - - [128, 64, 1, 256] - - [772, 379.919] + - [775, 379.919] - - [8448, 1500, 1, 2816] - - [839, 9155.82] + - [842, 9155.82] - - [256, 704, 1, 128] - - [824, 895.523] + - [827, 895.523] - - [2560, 7000, 1, 2560] - - [851, 8565.56] + - [854, 8565.56] - - [5888, 64, 1, 1280] - - [866, 5007.73] + - [869, 5007.73] - - [128, 4, 1, 3328] - - [885, 165.11] + - [888, 165.11] - - [5056, 6784, 1, 1280] - - [850, 9331.38] + - [853, 9331.38] - - [1024, 448, 1, 1280] - - [852, 6501.36] + - [855, 6501.36] - - [704, 5056, 1, 3328] - - [839, 8090.03] + - [842, 8090.03] - - [128, 5056, 1, 256] - - [852, 5537.27] + - [855, 5537.27] - - [3584, 5056, 1, 3328] - - [848, 8633.14] + - [851, 8633.14] - - [1856, 4, 1, 3328] - - [886, 582.714] + - [889, 582.714] - - [4, 2944, 1, 128] - - [877, 114.192] + - [880, 114.192] - - [2368, 2944, 1, 3328] - - [856, 8749.45] + - [859, 8749.45] - - [448, 448, 1, 1280] - - [780, 4694.83] + - [783, 4694.83] - - [128, 4, 1, 128] - - [877, 4.84734] + - [880, 4.84734] - - [2368, 3584, 1, 256] - - [856, 8418.49] + - [859, 8418.49] - - [4608, 3000, 1, 1536] - - [846, 9076.37] + - [849, 9076.37] - - [1024, 256, 1, 1280] - - [852, 5562.74] + - [855, 5562.74] - - [5056, 3584, 1, 1280] - - [846, 8364.99] + - [849, 8364.99] - - [5124, 9124, 1, 4096] - - [856, 8648.48] + - [859, 8648.48] - - [7680, 48000, 1, 2560] - - [850, 4098.16] + - [853, 4098.16] - - [1856, 704, 1, 1280] - - [842, 8140.94] + - [845, 8140.94] - - [1856, 2944, 1, 1280] - - [844, 8214.3] + - [847, 8214.3] - - [4608, 1500, 1, 1536] - - [852, 8424.43] + - [855, 8424.43] - - [1024, 48000, 1, 2816] - - [843, 8513.08] + - [846, 8513.08] - - [5124, 9124, 1, 2560] - - [860, 8641.14] + - [863, 8641.14] - - [128, 1024, 1, 256] - - [774, 2356.35] + - [777, 2356.35] - - [2944, 1408, 1, 256] - - [856, 8254.19] + - [859, 8254.19] - - [4288, 1408, 1, 3328] - - [850, 9138.39] + - [853, 9138.39] - - [3584, 64, 1, 3328] - - [763, 5629.52] + - [766, 5629.52] - - [5888, 2944, 1, 128] - - [824, 4119.23] + - [827, 4119.23] - - [2944, 1024, 1, 128] - - [826, 4002.86] + - [829, 4002.86] - - [128, 1, 1, 1024] - - [820, 19.9805] + - [823, 19.9805] - - [5124, 700, 1, 2048] - - [857, 7653.74] + - [860, 7653.74] - - [4, 4288, 1, 1280] - - [790, 587.649] + - [793, 587.649] - - [6784, 5056, 1, 128] - - [829, 4855.75] + - [832, 4855.75] - - [256, 1024, 1, 3328] - - [852, 6116.18] + - [855, 6116.18] - - [3584, 4, 1, 256] - - [764, 395.476] + - [767, 395.476] - - [1856, 64, 1, 3328] - - [779, 5732.5] + - [782, 5732.5] - - [4, 128, 1, 3328] - - [885, 162.589] + - [888, 162.589] - - [256, 12544, 1, 1024] - - [889, 7628.82] + - [892, 7628.82] - - [5888, 1408, 1, 3328] - - [850, 9524.33] + - [853, 9524.33] - - [448, 2944, 1, 128] - - [824, 3163.81] + - [827, 3163.81] - - [2368, 1856, 1, 256] - - [852, 8167.26] + - [855, 8167.26] - - [256, 5056, 1, 256] - - [842, 7292.03] + - [845, 7292.03] - - [5056, 5056, 1, 128] - - [830, 5043.89] + - [833, 5043.89] - - [448, 3584, 1, 3328] - - [847, 6839.46] + - [850, 6839.46] - - [4, 5056, 1, 3328] - - [795, 639.786] + - [798, 639.786] - - [256, 256, 1, 128] - - [754, 554.802] + - [757, 554.802] - - [5888, 256, 1, 128] - - [826, 3562.37] + - [829, 3562.37] - - [4, 5056, 1, 128] - - [877, 149.807] + - [880, 149.807] - - [448, 256, 1, 256] - - [773, 2121.4] + - [776, 2121.4] - - [704, 4, 1, 3328] - - [883, 455.819] + - [886, 455.819] - - [1408, 256, 1, 256] - - [842, 4352.58] + - [845, 4352.58] - - [3584, 1856, 1, 128] - - [833, 3933.13] + - [836, 3933.13] - - [4288, 4288, 1, 128] - - [824, 4888.51] + - [827, 4888.51] - - [1856, 1024, 1, 3328] - - [860, 8242.54] + - [863, 8242.54] - - [1856, 4288, 1, 128] - - [829, 4647.3] + - [832, 4647.3] - - [1024, 6000, 1, 2560] - - [854, 8526.65] + - [857, 8526.65] - - [1024, 5056, 1, 256] - - [839, 7343.73] + - [842, 7343.73] - - [5056, 5888, 1, 128] - - [828, 4053.4] + - [831, 4053.4] - - [2368, 1408, 1, 3328] - - [842, 8466.1] + - [845, 8466.1] - - [1024, 48000, 1, 1536] - - [860, 9487.64] + - [863, 9487.64] - - [5888, 448, 1, 256] - - [873, 6081.44] + - [876, 6081.44] - - [5888, 6784, 1, 128] - - [825, 4820.17] + - [828, 4820.17] - - [2368, 4, 1, 3328] - - [884, 620.528] + - [887, 620.528] - - [6784, 5056, 1, 1280] - - [869, 8525.4] + - [872, 8525.4] - - [5056, 704, 1, 1280] - - [839, 7932.96] + - [842, 7932.96] - - [1024, 48000, 1, 2560] - - [860, 8877.84] + - [863, 8877.84] - - [4608, 32, 1, 1536] - - [789, 3556.73] + - [792, 3556.73] - - [1024, 2368, 1, 128] - - [832, 2943.65] + - [835, 2943.65] - - [128, 704, 1, 256] - - [773, 2059.7] + - [776, 2059.7] - - [2368, 448, 1, 3328] - - [852, 5290.32] + - [855, 5290.32] - - [128, 5888, 1, 3328] - - [852, 7764.33] + - [855, 7764.33] - - [448, 128, 1, 1280] - - [803, 3373.18] + - [806, 3373.18] - - [6784, 4, 1, 3328] - - [762, 675.963] + - [765, 675.963] - - [4288, 4, 1, 1280] - - [795, 564.675] + - [798, 564.675] - - [1024, 64, 1, 3328] - - [809, 4293.38] + - [812, 4293.38] - - [3072, 48000, 1, 1024] - - [859, 7826.41] + - [862, 7826.41] - - [256, 4, 1, 128] - - [878, 4.83304] + - [881, 4.83304] - - [1024, 5888, 1, 128] - - [837, 3610.36] + - [840, 3610.36] - - [3584, 5888, 1, 128] - - [825, 4722.25] + - [828, 4722.25] - - [5056, 5888, 1, 256] - - [860, 9159.01] + - [863, 9159.01] - - [2368, 1024, 1, 256] - - [852, 7482.61] + - [855, 7482.61] - - [2944, 1856, 1, 256] - - [856, 8208.9] + - [859, 8208.9] - - [1856, 6784, 1, 1280] - - [852, 8205.33] + - [855, 8205.33] - - [64, 5056, 1, 128] - - [747, 2079.25] + - [750, 2079.25] - - [64, 6784, 1, 128] - - [747, 2437.48] + - [750, 2437.48] - - [448, 704, 1, 128] - - [823, 1506.35] + - [826, 1506.35] - - [4, 1024, 1, 128] - - [878, 17.2463] + - [881, 17.2463] - - [1408, 448, 1, 256] - - [842, 5545.35] + - [845, 5545.35] - - [1408, 704, 1, 128] - - [828, 2931.55] + - [831, 2931.55] - - [64, 256, 1, 3328] - - [814, 2816.42] + - [817, 2816.42] - - [8448, 3000, 1, 2816] - - [848, 8872.89] + - [851, 8872.89] - - [6784, 448, 1, 3328] - - [842, 7555.38] + - [845, 7555.38] - - [5056, 1856, 1, 1280] - - [840, 8652.26] + - [843, 8652.26] - - [1408, 1024, 1, 3328] - - [844, 7781.32] + - [847, 7781.32] - - [2368, 256, 1, 3328] - - [848, 5391.96] + - [851, 5391.96] - - [7680, 1500, 1, 2560] - - [846, 8919.62] + - [849, 8919.62] - - [5888, 3584, 1, 1280] - - [846, 9235.75] + - [849, 9235.75] - - [1856, 3584, 1, 3328] - - [857, 8348.73] + - [860, 8348.73] - - [5888, 128, 1, 1280] - - [842, 5928.51] + - [845, 5928.51] - - [1024, 2944, 1, 256] - - [873, 6630.17] + - [876, 6630.17] - - [448, 6784, 1, 1280] - - [854, 8332.35] + - [857, 8332.35] - - [256, 3584, 1, 1280] - - [844, 7140.09] + - [847, 7140.09] - - [448, 128, 1, 128] - - [746, 552.713] + - [749, 552.713] - - [704, 5056, 1, 256] - - [852, 7959.58] + - [855, 7959.58] - - [3584, 1024, 1, 3328] - - [844, 8386.74] + - [847, 8386.74] - - [2944, 1856, 1, 1280] - - [860, 7670.19] + - [863, 7670.19] - - [128, 256, 1, 128] - - [761, 258.27] + - [764, 258.27] - - [5056, 256, 1, 256] - - [852, 5736.67] + - [855, 5736.67] - - [2944, 4288, 1, 3328] - - [839, 8730.7] + - [842, 8730.7] - - [2368, 3584, 1, 3328] - - [841, 8437.61] + - [844, 8437.61] - - [2944, 704, 1, 1280] - - [852, 8342.43] + - [855, 8342.43] - - [128, 4, 1, 256] - - [772, 24.8242] + - [775, 24.8242] - - [2944, 3584, 1, 1280] - - [854, 8322.01] + - [857, 8322.01] - - [1856, 5888, 1, 1280] - - [839, 8911.81] + - [842, 8911.81] - - [256, 256, 1, 1280] - - [803, 3653.57] + - [806, 3653.57] - - [4608, 24000, 1, 1536] - - [853, 8930.96] + - [856, 8930.96] - - [4288, 1408, 1, 256] - - [840, 8338.35] + - [843, 8338.35] - - [3584, 64, 1, 256] - - [852, 3413.97] + - [855, 3413.97] - - [64, 1856, 1, 3328] - - [779, 5460.13] + - [782, 5460.13] - - [256, 1408, 1, 128] - - [823, 1423.99] + - [826, 1423.99] - - [5888, 1408, 1, 128] - - [834, 4177.78] + - [837, 4177.78] - - [4288, 2368, 1, 1280] - - [843, 8595.95] + - [846, 8595.95] - - [4, 4288, 1, 256] - - [879, 370.854] + - [882, 370.854] - - [256, 4288, 1, 128] - - [824, 2907.89] + - [827, 2907.89] - - [256, 128, 1, 3328] - - [817, 3644.78] + - [820, 3644.78] - - [512, 8, 1, 500000] - - [729, 2025.79] + - [732, 2025.79] - - [6784, 2368, 1, 256] - - [842, 8470.31] + - [845, 8470.31] - - [5888, 128, 1, 128] - - [747, 2604.45] + - [750, 2604.45] - - [1408, 448, 1, 3328] - - [852, 6540.52] + - [855, 6540.52] - - [1024, 24000, 1, 2816] - - [869, 8363.93] + - [872, 8363.93] - - [704, 1024, 1, 1280] - - [852, 7277.18] + - [855, 7277.18] - - [1856, 256, 1, 3328] - - [842, 7039.04] + - [845, 7039.04] - - [1856, 2944, 1, 256] - - [851, 8151.49] + - [854, 8151.49] - - [5056, 1024, 1, 128] - - [825, 4422.72] + - [828, 4422.72] - - [64, 5888, 1, 1280] - - [803, 4854.52] + - [806, 4854.52] - - [7680, 3000, 1, 2560] - - [856, 8789.47] + - [859, 8789.47] - - [4224, 1500, 1, 176] - - [852, 7902.04] + - [855, 7902.04] - - [5124, 700, 1, 2560] - - [842, 8232.49] + - [845, 8232.49] - - [6784, 256, 1, 128] - - [823, 3548.82] + - [826, 3548.82] - - [5888, 704, 1, 128] - - [830, 3959.55] + - [833, 3959.55] - - [6784, 64, 1, 128] - - [758, 2150.72] + - [761, 2150.72] - - [4, 448, 1, 1280] - - [883, 267.963] + - [886, 267.963] - - [1024, 4288, 1, 1280] - - [857, 8363.62] + - [860, 8363.62] - - [2368, 5056, 1, 3328] - - [856, 8581.75] + - [859, 8581.75] - - [448, 4, 1, 128] - - [877, 16.7673] + - [880, 16.7673] - - [4, 256, 1, 3328] - - [886, 201.888] + - [889, 201.888] - - [4288, 1024, 1, 3328] - - [852, 8567.62] + - [855, 8567.62] - - [6144, 48000, 1, 2560] - - [860, 3751.58] + - [863, 3751.58] - - [1024, 5056, 1, 3328] - - [839, 9440.56] + - [842, 9440.56] - - [1024, 1856, 1, 3328] - - [860, 8244.26] + - [863, 8244.26] - - [704, 704, 1, 1280] - - [852, 5529.89] + - [855, 5529.89] - - [128, 2368, 1, 1280] - - [809, 5062.28] + - [812, 5062.28] - - [3584, 4, 1, 128] - - [878, 61.4949] + - [881, 61.4949] - - [3584, 256, 1, 1280] - - [876, 6260.14] + - [879, 6260.14] - - [4, 128, 1, 128] - - [877, 1.1587] + - [880, 1.1587] - - [128, 4288, 1, 3328] - - [788, 6186.05] + - [791, 6186.05] - - [5124, 1500, 1, 2560] - - [856, 8432.52] + - [859, 8432.52] - - [3584, 128, 1, 1280] - - [842, 6547.75] + - [845, 6547.75] - - [4, 256, 1, 1280] - - [795, 180.044] + - [798, 180.044] - - [128, 704, 1, 3328] - - [767, 5177.71] + - [770, 5177.71] - - [4288, 6784, 1, 256] - - [840, 9005.24] + - [843, 9005.24] - - [3584, 2944, 1, 3328] - - [857, 8872.17] + - [860, 8872.17] - - [128, 1856, 1, 256] - - [842, 3690.38] + - [845, 3690.38] - - [64, 4288, 1, 256] - - [842, 3007.47] + - [845, 3007.47] - - [4, 3584, 1, 3328] - - [772, 639.89] + - [775, 639.89] - - [64, 4, 1, 3328] - - [886, 98.6074] + - [889, 98.6074] - - [4, 64, 1, 3328] - - [886, 91.8069] + - [889, 91.8069] - - [35, 700, 1, 2560] - - [740, 2397.55] + - [743, 2397.55] - - [5888, 2944, 1, 256] - - [850, 9031.18] + - [853, 9031.18] - - [4, 2368, 1, 256] - - [790, 256.868] + - [793, 256.868] - - [1856, 64, 1, 256] - - [774, 2222.86] + - [777, 2222.86] - - [5056, 128, 1, 1280] - - [842, 6557.75] + - [845, 6557.75] - - [448, 4288, 1, 1280] - - [866, 6891.56] + - [869, 6891.56] - - [256, 4288, 1, 256] - - [842, 6250.41] + - [845, 6250.41] - - [1024, 4288, 1, 128] - - [826, 3951.31] + - [829, 3951.31] - - [4, 1024, 1, 256] - - [790, 182.044] + - [793, 182.044] - - [5056, 4288, 1, 256] - - [846, 8933.33] + - [849, 8933.33] - - [1024, 448, 1, 256] - - [852, 4573.23] + - [855, 4573.23] - - [1024, 3584, 1, 256] - - [847, 7447.08] + - [850, 7447.08] - - [2944, 128, 1, 1280] - - [852, 5417.17] + - [855, 5417.17] - - [2560, 32, 1, 2560] - - [789, 4076.89] + - [792, 4076.89] - - [64, 256, 1, 256] - - [806, 689.853] + - [809, 689.853] - - [1024, 4, 1, 512] - - [798, 288.07] + - [801, 288.07] - - [128, 2368, 1, 128] - - [752, 1809.58] + - [755, 1809.58] - - [256, 704, 1, 1280] - - [842, 4032.98] + - [845, 4032.98] - - [64, 2368, 1, 128] - - [743, 1165.78] + - [746, 1165.78] - - [176, 1500, 1, 1408] - - [770, 4922.03] + - [773, 4922.03] - - [448, 5888, 1, 1280] - - [852, 7550.11] + - [855, 7550.11] - - [512, 3000, 1, 2048] - - [874, 6562.34] + - [877, 6562.34] - - [5056, 448, 1, 128] - - [824, 3947.87] + - [827, 3947.87] - - [4288, 704, 1, 1280] - - [842, 8243.72] + - [845, 8243.72] - - [3584, 2944, 1, 128] - - [834, 4284.78] + - [837, 4284.78] - - [6784, 256, 1, 1280] - - [842, 7955.11] + - [845, 7955.11] - - [256, 2944, 1, 1280] - - [872, 6691.8] + - [875, 6691.8] - - [2560, 128, 1, 2560] - - [810, 5347.13] + - [813, 5347.13] - - [2368, 5888, 1, 3328] - - [847, 8918.97] + - [850, 8918.97] - - [4, 64, 1, 256] - - [795, 13.0032] + - [798, 13.0032] - - [704, 1024, 1, 3328] - - [872, 6648.02] + - [875, 6648.02] - - [2368, 1856, 1, 1280] - - [858, 8016.41] + - [861, 8016.41] - - [448, 5056, 1, 3328] - - [842, 8231.63] + - [845, 8231.63] - - [128, 448, 1, 128] - - [751, 441.108] + - [754, 441.108] - - [128, 6784, 1, 256] - - [852, 5849.95] + - [855, 5849.95] - - [512, 4, 1, 500000] - - [732, 1027.04] + - [735, 1027.04] - - [3584, 4288, 1, 128] - - [828, 4260.8] + - [831, 4260.8] - - [64, 448, 1, 128] - - [751, 253.454] + - [754, 253.454] - - [1024, 6000, 1, 2816] - - [856, 8886.04] + - [859, 8886.04] - - [5888, 4288, 1, 3328] - - [856, 8968.06] + - [859, 8968.06] - - [2368, 704, 1, 256] - - [872, 4663.14] + - [875, 4663.14] - - [256, 1856, 1, 3328] - - [844, 6480.53] + - [847, 6480.53] - - [1856, 128, 1, 256] - - [842, 3726.56] + - [845, 3726.56] - - [6784, 128, 1, 128] - - [745, 2823.91] + - [748, 2823.91] - - [3584, 1408, 1, 128] - - [828, 3666.68] + - [831, 3666.68] - - [1856, 5056, 1, 1280] - - [839, 8651.26] + - [842, 8651.26] - - [2944, 1024, 1, 1280] - - [850, 8765.11] + - [853, 8765.11] - - [5056, 4, 1, 256] - - [764, 428.588] + - [767, 428.588] - - [3584, 5888, 1, 3328] - - [850, 9347.65] + - [853, 9347.65] - - [2368, 4288, 1, 256] - - [860, 8013.0] + - [863, 8013.0] - - [1024, 2368, 1, 3328] - - [847, 8119.19] + - [850, 8119.19] - - [128, 3584, 1, 128] - - [747, 2584.52] + - [750, 2584.52] - - [704, 1408, 1, 256] - - [852, 6792.17] + - [855, 6792.17] - - [4096, 128, 1, 4096] - - [874, 6624.74] + - [877, 6624.74] - - [1024, 2944, 1, 128] - - [826, 3771.27] + - [829, 3771.27] - - [1024, 3584, 1, 1280] - - [847, 8952.61] + - [850, 8952.61] - - [4288, 5888, 1, 3328] - - [860, 9047.95] + - [863, 9047.95] - - [4288, 4, 1, 3328] - - [765, 615.106] + - [768, 615.106] - - [4608, 16, 1, 1536] - - [769, 2894.84] + - [772, 2894.84] - - [5888, 64, 1, 128] - - [756, 1827.06] + - [759, 1827.06] - - [4, 5888, 1, 128] - - [877, 179.444] + - [880, 179.444] - - [1024, 2944, 1, 3328] - - [848, 8298.67] + - [851, 8298.67] - - [2048, 64, 1, 2048] - - [777, 4963.67] + - [780, 4963.67] - - [6144, 2, 1, 2560] - - [766, 477.78] + - [769, 477.78] - - [256, 6784, 1, 1280] - - [840, 7491.84] + - [843, 7491.84] - - [1856, 3584, 1, 256] - - [852, 7580.5] + - [855, 7580.5] - - [128, 448, 1, 3328] - - [803, 4417.61] + - [806, 4417.61] - - [6784, 1856, 1, 128] - - [831, 4621.64] + - [834, 4621.64] - - [1024, 1500, 1, 2048] - - [852, 6284.4] + - [855, 6284.4] - - [5056, 128, 1, 256] - - [852, 5705.06] + - [855, 5705.06] - - [512, 24000, 1, 2816] - - [839, 8919.75] + - [842, 8919.75] - - [256, 5888, 1, 1280] - - [854, 7977.9] + - [857, 7977.9] - - [4, 128, 1, 1280] - - [795, 94.1609] + - [798, 94.1609] - - [4288, 6784, 1, 3328] - - [860, 9012.48] + - [863, 9012.48] - - [6784, 128, 1, 1280] - - [844, 6807.25] + - [847, 6807.25] - - [64, 1408, 1, 256] - - [773, 2045.09] + - [776, 2045.09] - - [2368, 1408, 1, 128] - - [824, 4340.63] + - [827, 4340.63] - - [1856, 448, 1, 256] - - [873, 3639.89] + - [876, 3639.89] - - [1408, 1024, 1, 128] - - [832, 3417.58] + - [835, 3417.58] - - [128, 64, 1, 128] - - [753, 68.6241] + - [756, 68.6241] - - [6784, 3584, 1, 3328] - - [850, 9425.53] + - [853, 9425.53] - - [1760, 7000, 1, 1760] - - [847, 8780.31] + - [850, 8780.31] - - [1024, 704, 1, 3328] - - [864, 5644.5] + - [867, 5644.5] - - [64, 64, 1, 128] - - [743, 38.1023] + - [746, 38.1023] - - [2368, 5056, 1, 1280] - - [861, 8462.31] + - [864, 8462.31] - - [64, 4, 1, 1280] - - [795, 46.5455] + - [798, 46.5455] - - [1408, 2368, 1, 1280] - - [847, 8234.98] + - [850, 8234.98] - - [128, 1408, 1, 1280] - - [809, 4491.56] + - [812, 4491.56] - - [1024, 1, 1, 512] - - [813, 81.92] + - [816, 81.92] - - [4, 1408, 1, 128] - - [877, 56.32] + - [880, 56.32] - - [704, 4288, 1, 128] - - [831, 3942.86] + - [834, 3942.86] - - [128, 1856, 1, 3328] - - [797, 6111.83] + - [800, 6111.83] - - [2944, 2944, 1, 256] - - [856, 8640.12] + - [859, 8640.12] - - [2944, 4, 1, 1280] - - [790, 554.165] + - [793, 554.165] - - [5888, 4, 1, 256] - - [772, 435.644] + - [775, 435.644] - - [6784, 256, 1, 256] - - [852, 7025.86] + - [855, 7025.86] - - [256, 5056, 1, 3328] - - [852, 8249.47] + - [855, 8249.47] - - [128, 4288, 1, 1280] - - [842, 5561.64] + - [845, 5561.64] - - [5056, 1856, 1, 128] - - [836, 3975.18] + - [839, 3975.18] - - [1024, 3000, 1, 1536] - - [857, 8544.44] + - [860, 8544.44] - - [5056, 1024, 1, 3328] - - [850, 9361.37] + - [853, 9361.37] - - [128, 128, 1, 256] - - [802, 699.051] + - [805, 699.051] - - [1760, 64, 1, 1760] - - [770, 4956.16] + - [773, 4956.16] - - [4288, 3584, 1, 3328] - - [870, 7506.08] + - [873, 7506.08] - - [448, 704, 1, 3328] - - [842, 4697.56] + - [845, 4697.56] - - [448, 448, 1, 128] - - [759, 1249.52] + - [762, 1249.52] - - [1024, 2368, 1, 1280] - - [852, 7756.34] + - [855, 7756.34] - - [1856, 704, 1, 3328] - - [852, 8340.56] + - [855, 8340.56] - - [512, 1500, 1, 2560] - - [854, 6041.29] + - [857, 6041.29] - - [5888, 6784, 1, 3328] - - [850, 9199.28] + - [853, 9199.28] - - [704, 4288, 1, 1280] - - [844, 8341.96] + - [847, 8341.96] - - [128, 50176, 1, 512] - - [890, 7589.38] + - [893, 7589.38] - - [704, 256, 1, 256] - - [842, 2912.71] + - [845, 2912.71] - - [1024, 48000, 1, 2048] - - [847, 8947.32] + - [850, 8947.32] - - [4288, 1024, 1, 128] - - [823, 4291.65] + - [826, 4291.65] - - [3136, 64, 128, 64] - - [905, 8175.06] + - [908, 8175.06] - - [784, 128, 128, 512] - - [904, 8190.53] + - [907, 8190.53] - - [784, 512, 256, 128] - - [902, 8637.14] + - [905, 8637.14] - - [3136, 256, 256, 64] - - [902, 8663.08] + - [905, 8663.08] - - [3136, 64, 128, 256] - - [900, 8943.46] + - [903, 8943.46] - - [3136, 64, 256, 64] - - [905, 8267.12] + - [908, 8267.12] - - [784, 512, 128, 128] - - [902, 8564.25] + - [905, 8564.25] - - [784, 128, 256, 512] - - [906, 8377.06] + - [909, 8377.06] - - [3136, 64, 256, 256] - - [907, 9033.88] + - [910, 9033.88] - - [3136, 256, 128, 64] - - [902, 8624.46] + - [905, 8624.46] - - [1024, 256, 1, 1024] - - [928, 6331.03] + - [931, 6331.03] - - [1024, 512, 1, 2048] - - [927, 8100.04] + - [930, 8100.04] - - [512, 200, 1, 512] - - [936, 2861.83] + - [939, 2861.83] - - [4096, 256, 1, 2048] - - [919, 8812.72] + - [922, 8812.72] - - [4096, 512, 1, 1024] - - [929, 9068.77] + - [932, 9068.77] - - [1024, 200, 1, 1024] - - [928, 5110.02] + - [931, 5110.02] - - [1024, 512, 1, 1024] - - [921, 7785.25] + - [924, 7785.25] - - [2048, 256, 1, 4096] - - [931, 8438.71] + - [934, 8438.71] - - [2048, 768, 1, 512] - - [913, 8618.43] + - [916, 8618.43] - - [512, 256, 1, 1024] - - [933, 4834.93] + - [936, 4834.93] - - [512, 768, 1, 2048] - - [930, 6908.94] + - [933, 6908.94] - - [2048, 256, 1, 1024] - - [926, 7941.88] + - [929, 7941.88] - - [1024, 256, 1, 2048] - - [923, 6997.8] + - [926, 6997.8] - - [2048, 200, 1, 512] - - [926, 5649.66] + - [929, 5649.66] - - [4096, 200, 1, 1024] - - [924, 6678.83] + - [927, 6678.83] - - [2048, 200, 1, 4096] - - [932, 6706.59] + - [935, 6706.59] - - [2048, 512, 1, 1024] - - [929, 8548.9] + - [932, 8548.9] - - [1024, 1024, 1, 512] - - [924, 8046.63] + - [927, 8046.63] - - [1024, 200, 1, 4096] - - [923, 5884.26] + - [926, 5884.26] - - [2048, 512, 1, 4096] - - [934, 8995.84] + - [937, 8995.84] - - [4096, 512, 1, 2048] - - [929, 9298.08] + - [932, 9298.08] - - [4096, 1024, 1, 2048] - - [911, 9790.67] + - [914, 9790.67] - - [2048, 1024, 1, 2048] - - [912, 9278.8] + - [915, 9278.8] - - [1024, 200, 1, 512] - - [928, 4535.36] + - [931, 4535.36] - - [1024, 1024, 1, 4096] - - [919, 8967.29] + - [922, 8967.29] - - [2048, 1024, 1, 4096] - - [914, 9500.46] + - [917, 9500.46] - - [4096, 200, 1, 2048] - - [920, 7082.58] + - [923, 7082.58] - - [2048, 200, 1, 1024] - - [926, 6211.94] + - [929, 6211.94] - - [1024, 768, 1, 512] - - [927, 7401.71] + - [930, 7401.71] - - [2048, 512, 1, 512] - - [924, 8124.56] + - [927, 8124.56] - - [2048, 200, 1, 2048] - - [926, 6561.8] + - [929, 6561.8] - - [2048, 256, 1, 2048] - - [927, 8224.13] + - [930, 8224.13] - - [512, 768, 1, 512] - - [925, 6469.36] + - [928, 6469.36] - - [512, 200, 1, 1024] - - [928, 3755.64] + - [931, 3755.64] - - [4096, 1024, 1, 1024] - - [911, 9605.85] + - [914, 9605.85] - - [4096, 256, 1, 4096] - - [934, 8961.29] + - [937, 8961.29] - - [1024, 512, 1, 512] - - [927, 7108.99] + - [930, 7108.99] - - [512, 256, 1, 512] - - [935, 4032.98] + - [938, 4032.98] - - [1024, 256, 1, 4096] - - [923, 7326.3] + - [926, 7326.3] - - [1024, 200, 1, 2048] - - [916, 5530.46] + - [919, 5530.46] - - [2048, 1024, 1, 512] - - [917, 8995.83] + - [920, 8995.83] - - [1024, 1024, 1, 2048] - - [924, 8830.11] + - [927, 8830.11] - - [4096, 256, 1, 1024] - - [924, 8581.7] + - [927, 8581.7] - - [512, 768, 1, 1024] - - [925, 6875.91] + - [928, 6875.91] - - [1024, 512, 1, 4096] - - [921, 8484.05] + - [924, 8484.05] - - [1024, 256, 1, 512] - - [918, 5667.98] + - [921, 5667.98] - - [4096, 200, 1, 4096] - - [931, 7018.59] + - [934, 7018.59] - - [2048, 256, 1, 512] - - [931, 7078.99] + - [934, 7078.99] - - [512, 200, 1, 2048] - - [936, 4283.4] + - [939, 4283.4] - - [1024, 1024, 1, 1024] - - [919, 8565.27] + - [922, 8565.27] - - [2048, 512, 1, 2048] - - [919, 8850.49] + - [922, 8850.49] - - [4096, 1024, 1, 4096] - - [912, 9843.18] + - [915, 9843.18] - - [2048, 1024, 1, 1024] - - [917, 9234.11] + - [920, 9234.11] - - [4096, 384, 1, 2048] - - [959, 8892.52] + - [962, 8892.52] - - [4096, 192, 1, 2048] - - [953, 8024.18] + - [956, 8024.18] - - [1225, 192, 64, 384] - - [942, 9373.83] + - [945, 9373.83] - - [5329, 64, 64, 160] - - [946, 9186.69] + - [949, 9186.69] - - [1225, 64, 64, 384] - - [941, 8735.76] + - [944, 8735.76] - - [289, 128, 64, 1024] - - [956, 7000.2] + - [959, 7000.2] - - [4096, 320, 1, 1280] - - [961, 8302.26] + - [964, 8302.26] - - [4096, 384, 1, 1536] - - [943, 9052.45] + - [946, 9052.45] - - [4096, 192, 1, 1280] - - [958, 7561.85] + - [961, 7561.85] - - [289, 192, 64, 1024] - - [952, 7346.99] + - [955, 7346.99] - - [1225, 96, 64, 384] - - [939, 8303.08] + - [942, 8303.08] - - [4096, 320, 1, 2048] - - [948, 8384.42] + - [951, 8384.42] - - [4096, 256, 1, 1536] - - [960, 8734.34] + - [963, 8734.34] - - [4096, 384, 1, 1280] - - [957, 9023.24] + - [960, 9023.24] - - [4096, 448, 1, 1280] - - [948, 8343.32] + - [951, 8343.32] - - [289, 256, 64, 1024] - - [951, 7535.46] + - [954, 7535.46] - - [4096, 448, 1, 2048] - - [948, 8572.31] + - [951, 8572.31] - - [289, 384, 64, 1024] - - [949, 7767.57] + - [952, 7767.57] - - [1024, 3594, 1, 4096] - - [968, 8661.42] + - [971, 8661.42] - - [4096, 3103, 1, 1024] - - [978, 9652.13] + - [981, 9652.13] - - [4096, 3136, 1, 1024] - - [962, 9723.05] + - [965, 9723.05] - - [1024, 3141, 1, 4096] - - [980, 8612.02] + - [983, 8612.02] - - [64, 147, 432, 148] - - [995, 6371.93] + - [998, 6371.93] - - [4096, 3559, 1, 1024] - - [967, 9906.25] + - [970, 9906.25] - - [4096, 3368, 1, 1024] - - [962, 9720.91] + - [965, 9720.91] - - [1024, 3335, 1, 4096] - - [986, 8990.19] + - [989, 8990.19] - - [1024, 3510, 1, 4096] - - [986, 9440.58] + - [989, 9440.58] - - [4096, 3209, 1, 1024] - - [967, 9632.66] + - [970, 9632.66] - - [4096, 3322, 1, 1024] - - [966, 9939.42] + - [969, 9939.42] - - [1024, 3400, 1, 4096] - - [985, 9155.99] + - [988, 9155.99] - - [1024, 3995, 1, 4096] - - [968, 9610.15] + - [971, 9610.15] - - [1024, 3503, 1, 4096] - - [986, 9446.47] + - [989, 9446.47] - - [4096, 3594, 1, 1024] - - [977, 9691.86] + - [980, 9691.86] - - [4096, 3473, 1, 1024] - - [966, 9698.8] + - [969, 9698.8] - - [4096, 3522, 1, 1024] - - [967, 9816.82] + - [970, 9816.82] - - [1024, 3103, 1, 4096] - - [964, 8490.95] + - [967, 8490.95] - - [1024, 3214, 1, 4096] - - [985, 8667.57] + - [988, 8667.57] - - [4096, 3449, 1, 1024] - - [977, 9795.61] + - [980, 9795.61] - - [1024, 3136, 1, 4096] - - [986, 8500.51] + - [989, 8500.51] - - [1024, 3955, 1, 33708] - - [966, 9634.84] + - [969, 9634.84] - - [1024, 3780, 1, 4096] - - [969, 9088.78] + - [972, 9088.78] - - [1024, 3906, 1, 33708] - - [967, 9515.36] + - [970, 9515.36] - - [1024, 3386, 1, 4096] - - [986, 9115.95] + - [989, 9115.95] - - [4096, 3396, 1, 1024] - - [977, 9665.5] + - [980, 9665.5] - - [1024, 3183, 1, 4096] - - [964, 8662.84] + - [967, 8662.84] - - [1024, 3098, 1, 4096] - - [980, 8490.12] + - [983, 8490.12] - - [1024, 3548, 1, 4096] - - [986, 9555.53] + - [989, 9555.53] - - [1024, 3224, 1, 4096] - - [979, 8760.78] + - [982, 8760.78] - - [4096, 3469, 1, 1024] - - [966, 9687.11] + - [969, 9687.11] - - [1024, 3582, 1, 4096] - - [983, 9690.9] + - [986, 9690.9] - - [1024, 2977, 1, 4096] - - [968, 9379.28] + - [971, 9379.28] - - [1024, 3939, 1, 1024] - - [965, 9172.01] + - [968, 9172.01] - - [64, 123, 528, 123] - - [1013, 6346.07] + - [1016, 6346.07] - - [64, 12, 5040, 12] - - [990, 1536.0] + - [993, 1536.0] - - [4096, 3176, 1, 1024] - - [978, 9712.1] + - [981, 9712.1] - - [1024, 3559, 1, 4096] - - [982, 9579.74] + - [985, 9579.74] - - [1024, 3478, 1, 4096] - - [986, 9373.75] + - [989, 9373.75] - - [4096, 3343, 1, 1024] - - [962, 9638.67] + - [965, 9638.67] - - [4096, 3440, 1, 1024] - - [962, 9853.86] + - [965, 9853.86] - - [1024, 3996, 1, 33708] - - [966, 9733.45] + - [969, 9733.45] - - [1024, 4012, 1, 4096] - - [967, 9636.89] + - [970, 9636.89] - - [1024, 3322, 1, 4096] - - [986, 8945.02] + - [989, 8945.02] - - [1024, 3990, 1, 33708] - - [966, 9720.21] + - [969, 9720.21] - - [1024, 3314, 1, 4096] - - [986, 8944.62] + - [989, 8944.62] - - [4096, 3513, 1, 1024] - - [966, 9794.85] + - [969, 9794.85] - - [1024, 3562, 1, 4096] - - [986, 9597.18] + - [989, 9597.18] - - [1024, 3443, 1, 4096] - - [986, 9279.42] + - [989, 9279.42] - - [1024, 3554, 1, 4096] - - [983, 9552.06] + - [986, 9552.06] - - [1024, 3063, 1, 4096] - - [968, 9622.48] + - [971, 9622.48] - - [64, 111, 576, 112] - - [1013, 6274.55] + - [1016, 6274.55] - - [4096, 3460, 1, 1024] - - [966, 9665.59] + - [969, 9665.59] - - [1024, 3209, 1, 4096] - - [965, 8708.29] + - [968, 8708.29] - - [1024, 3147, 1, 4096] - - [986, 8492.13] + - [989, 8492.13] - - [4096, 3387, 1, 1024] - - [963, 9761.24] + - [966, 9761.24] - - [4096, 3436, 1, 1024] - - [962, 9815.05] + - [965, 9815.05] - - [1024, 3341, 1, 4096] - - [985, 9004.97] + - [988, 9004.97] - - [1024, 3516, 1, 4096] - - [985, 9471.29] + - [988, 9471.29] - - [4096, 3277, 1, 1024] - - [966, 9807.02] + - [969, 9807.02] - - [1024, 3454, 1, 4096] - - [986, 9300.93] + - [989, 9300.93] - - [1024, 3969, 1, 4096] - - [966, 9539.72] + - [969, 9539.72] - - [1024, 3999, 1, 4096] - - [967, 9607.42] + - [970, 9607.42] - - [1024, 4032, 1, 4096] - - [968, 9693.37] + - [971, 9693.37] - - [4096, 3541, 1, 1024] - - [967, 9866.63] + - [970, 9866.63] - - [4096, 3334, 1, 1024] - - [978, 9614.31] + - [981, 9614.31] - - [1024, 3365, 1, 4096] - - [986, 9058.48] + - [989, 9058.48] - - [1024, 3527, 1, 4096] - - [986, 9510.21] + - [989, 9510.21] - - [1024, 3190, 1, 4096] - - [985, 8627.7] + - [988, 8627.7] - - [4096, 3906, 1, 1024] - - [963, 9817.68] + - [966, 9817.68] - - [1024, 3593, 1, 4096] - - [968, 8662.99] + - [971, 8662.99] - - [1024, 3336, 1, 4096] - - [986, 8991.03] + - [989, 8991.03] - - [4096, 3504, 1, 1024] - - [966, 9769.76] + - [969, 9769.76] - - [4096, 3977, 1, 1024] - - [967, 9742.52] + - [970, 9742.52] - - [1024, 3906, 1, 4096] - - [967, 9386.15] + - [970, 9386.15] - - [4096, 3415, 1, 1024] - - [977, 9802.6] + - [980, 9802.6] - - [1024, 3295, 1, 4096] - - [985, 8879.16] + - [988, 8879.16] - - [4096, 3321, 1, 1024] - - [967, 9931.33] + - [970, 9931.33] - - [1024, 3072, 1, 4096] - - [968, 9671.61] + - [971, 9671.61] - - [1024, 3408, 1, 4096] - - [985, 9182.73] + - [988, 9182.73] - - [1024, 3522, 1, 4096] - - [986, 9484.53] + - [989, 9484.53] - - [4096, 3751, 1, 1024] - - [967, 9778.76] + - [970, 9778.76] - - [4096, 3378, 1, 1024] - - [977, 9692.67] + - [980, 9692.67] - - [64, 77, 816, 77] - - [1019, 4850.19] + - [1022, 4850.19] - - [1024, 3925, 1, 33708] - - [966, 9560.78] + - [969, 9560.78] - - [1024, 3990, 1, 1024] - - [968, 9272.65] + - [971, 9272.65] - - [1024, 3290, 1, 4096] - - [979, 8905.51] + - [982, 8905.51] - - [4096, 3500, 1, 1024] - - [967, 9761.72] + - [970, 9761.72] - - [4096, 3565, 1, 1024] - - [966, 9919.27] + - [969, 9919.27] - - [1024, 3484, 1, 4096] - - [985, 9376.42] + - [988, 9376.42] - - [4096, 3395, 1, 1024] - - [978, 9788.06] + - [981, 9788.06] - - [64, 92, 688, 92] - - [1005, 5606.0] + - [1008, 5606.0] - - [1024, 3681, 1, 1024] - - [970, 8690.13] + - [973, 8690.13] - - [64, 159, 400, 159] - - [997, 6518.87] + - [1000, 6518.87] - - [1024, 3584, 1, 1024] - - [985, 9365.27] + - [988, 9365.27] - - [4096, 3093, 1, 1024] - - [977, 9623.31] + - [980, 9623.31] - - [1024, 4050, 1, 1024] - - [969, 9354.04] + - [972, 9354.04] - - [1024, 3301, 1, 4096] - - [986, 8888.94] + - [989, 8888.94] - - [1024, 3581, 1, 4096] - - [985, 9673.72] + - [988, 9673.72] - - [4096, 3374, 1, 1024] - - [978, 9707.23] + - [981, 9707.23] - - [1024, 3449, 1, 4096] - - [986, 9270.8] + - [989, 9270.8] - - [4096, 3215, 1, 1024] - - [967, 9645.15] + - [970, 9645.15] - - [4096, 3312, 1, 1024] - - [967, 9888.62] + - [970, 9888.62] - - [4096, 3479, 1, 1024] - - [967, 9698.51] + - [970, 9698.51] - - [4096, 3544, 1, 1024] - - [967, 9874.99] + - [970, 9874.99] - - [1024, 3263, 1, 4096] - - [986, 8787.51] + - [989, 8787.51] - - [4096, 3455, 1, 1024] - - [977, 9845.19] + - [980, 9845.19] - - [1024, 3379, 1, 4096] - - [983, 9099.91] + - [986, 9099.91] - - [1024, 3490, 1, 4096] - - [986, 9397.39] + - [989, 9397.39] - - [1024, 3368, 1, 4096] - - [986, 9079.15] + - [989, 9079.15] - - [4096, 3186, 1, 1024] - - [962, 9750.07] + - [965, 9750.07] - - [1024, 3428, 1, 4096] - - [986, 9232.82] + - [989, 9232.82] - - [64, 85, 752, 84] - - [1001, 5342.57] + - [1004, 5342.57] - - [4096, 3561, 1, 1024] - - [967, 9913.92] + - [970, 9913.92] - - [4096, 3418, 1, 1024] - - [977, 9765.76] + - [980, 9765.76] - - [1024, 3064, 1, 4096] - - [968, 9621.58] + - [971, 9621.58] - - [4096, 3259, 1, 1024] - - [967, 9765.42] + - [970, 9765.42] - - [4096, 3308, 1, 1024] - - [966, 9900.36] + - [969, 9900.36] - - [1024, 3533, 1, 4096] - - [986, 9520.02] + - [989, 9520.02] - - [1024, 3344, 1, 4096] - - [986, 9014.45] + - [989, 9014.45] - - [1024, 4030, 1, 1024] - - [968, 9354.0] + - [971, 9354.0] - - [4096, 3459, 1, 1024] - - [967, 9656.1] + - [970, 9656.1] - - [1024, 3572, 1, 4096] - - [983, 9639.97] + - [986, 9639.97] - - [1024, 3925, 1, 1024] - - [979, 9173.64] + - [982, 9173.64] - - [4096, 3435, 1, 1024] - - [962, 9778.1] + - [965, 9778.1] - - [1024, 3956, 1, 4096] - - [969, 9498.46] + - [972, 9498.46] - - [1024, 3463, 1, 4096] - - [986, 9332.36] + - [989, 9332.36] - - [4096, 3182, 1, 1024] - - [977, 9826.74] + - [980, 9826.74] - - [4096, 3976, 1, 1024] - - [977, 9741.89] + - [980, 9741.89] - - [1024, 3417, 1, 4096] - - [986, 9208.87] + - [989, 9208.87] - - [1024, 3528, 1, 4096] - - [986, 9508.99] + - [989, 9508.99] - - [4096, 3446, 1, 1024] - - [977, 9816.87] + - [980, 9816.87] - - [64, 122, 528, 123] - - [1013, 6325.88] + - [1016, 6325.88] - - [1024, 3543, 1, 4096] - - [986, 9538.63] + - [989, 9538.63] - - [4096, 3287, 1, 1024] - - [966, 9845.94] + - [969, 9845.94] - - [1024, 3499, 1, 4096] - - [986, 9428.41] + - [989, 9428.41] - - [1024, 3231, 1, 4096] - - [979, 8769.81] + - [982, 8769.81] - - [64, 17, 3632, 17] - - [1001, 1934.84] + - [1004, 1934.84] - - [4096, 3519, 1, 1024] - - [966, 9804.28] + - [969, 9804.28] - - [4096, 3552, 1, 1024] - - [966, 9892.55] + - [969, 9892.55] - - [1024, 3458, 1, 4096] - - [986, 9312.18] + - [989, 9312.18] - - [64, 93, 688, 92] - - [1005, 5660.12] + - [1008, 5660.12] - - [1024, 3374, 1, 4096] - - [980, 9110.31] + - [983, 9110.31] - - [1024, 3396, 1, 4096] - - [986, 9145.69] + - [989, 9145.69] - - [1024, 2967, 1, 4096] - - [968, 9364.66] + - [971, 9364.66] - - [64, 19, 3264, 19] - - [1005, 2142.37] + - [1008, 2142.37] - - [4096, 3482, 1, 1024] - - [966, 9714.1] + - [969, 9714.1] - - [64, 32, 1984, 32] - - [1016, 3619.81] + - [1019, 3619.81] - - [64, 102, 624, 99] - - [1007, 5515.23] + - [1010, 5515.23] - - [1024, 3226, 1, 4096] - - [965, 8790.37] + - [968, 8790.37] - - [4096, 3377, 1, 1024] - - [963, 9683.98] + - [966, 9683.98] - - [4096, 3426, 1, 1024] - - [978, 9869.84] + - [981, 9869.84] - - [4096, 2935, 1, 1024] - - [978, 9762.01] + - [981, 9762.01] - - [64, 133, 480, 133] - - [1017, 5891.22] + - [1020, 5891.22] - - [1024, 3439, 1, 4096] - - [986, 9253.89] + - [989, 9253.89] - - [4096, 3267, 1, 1024] - - [966, 9783.8] + - [969, 9783.8] - - [4096, 3499, 1, 1024] - - [967, 9761.01] + - [970, 9761.01] - - [4096, 3356, 1, 1024] - - [978, 9679.34] + - [981, 9679.34] - - [64, 232, 272, 232] - - [1021, 7180.93] + - [1024, 7180.93] - - [64, 162, 400, 159] - - [981, 6444.53] + - [984, 6444.53] - - [4096, 3939, 1, 1024] - - [977, 9877.9] + - [980, 9877.9] - - [1024, 3526, 1, 4096] - - [986, 9508.0] + - [989, 9508.0] - - [1024, 3859, 1, 33708] - - [967, 9402.03] + - [970, 9402.03] - - [1024, 3385, 1, 4096] - - [985, 9107.18] + - [988, 9107.18] - - [1024, 3496, 1, 4096] - - [986, 9417.9] + - [989, 9417.9] - - [4096, 3141, 1, 1024] - - [978, 9682.44] + - [981, 9682.44] - - [4096, 3510, 1, 1024] - - [966, 9786.49] + - [969, 9786.49] - - [1024, 3434, 1, 4096] - - [986, 9246.6] + - [989, 9246.6] - - [4096, 3969, 1, 1024] - - [966, 9714.75] + - [969, 9714.75] - - [1024, 3121, 1, 4096] - - [964, 8464.22] + - [967, 8464.22] - - [1024, 3232, 1, 4096] - - [986, 8711.63] + - [989, 8711.63] - - [1024, 4030, 1, 33708] - - [967, 9816.21] + - [970, 9816.21] - - [1024, 3780, 1, 33708] - - [975, 9315.44] + - [978, 9315.44] - - [1024, 3969, 1, 1024] - - [964, 9248.44] + - [967, 9248.44] - - [4096, 3527, 1, 1024] - - [966, 9832.84] + - [969, 9832.84] - - [4096, 3336, 1, 1024] - - [963, 9623.25] + - [966, 9623.25] - - [4096, 3290, 1, 1024] - - [966, 9852.11] + - [969, 9852.11] - - [64, 9, 6544, 9] - - [1006, 1068.14] + - [1009, 1068.14] - - [1024, 3469, 1, 4096] - - [986, 9350.45] + - [989, 9350.45] - - [4096, 3490, 1, 1024] - - [966, 9737.46] + - [969, 9737.46] - - [4096, 3064, 1, 1024] - - [966, 9889.92] + - [969, 9889.92] - - [4096, 3582, 1, 1024] - - [967, 9961.28] + - [970, 9961.28] - - [1024, 3956, 1, 1024] - - [964, 9294.15] + - [967, 9294.15] - - [4096, 3417, 1, 1024] - - [962, 9811.56] + - [965, 9811.56] - - [1024, 2736, 1, 4096] - - [968, 8636.6] + - [971, 8636.6] - - [64, 78, 816, 78] - - [1005, 4946.0] + - [1008, 4946.0] - - [1024, 3205, 1, 4096] - - [980, 8657.11] + - [983, 8657.11] - - [1024, 3143, 1, 4096] - - [980, 8567.77] + - [983, 8567.77] - - [1024, 4020, 1, 4096] - - [968, 9664.52] + - [971, 9664.52] - - [1024, 3318, 1, 4096] - - [965, 8966.95] + - [968, 8966.95] - - [4096, 3364, 1, 1024] - - [978, 9697.08] + - [981, 9697.08] - - [1024, 3353, 1, 4096] - - [986, 9034.07] + - [989, 9034.07] - - [1024, 3464, 1, 4096] - - [986, 9325.95] + - [989, 9325.95] - - [4096, 3205, 1, 1024] - - [966, 9619.0] + - [969, 9619.0] - - [4096, 3318, 1, 1024] - - [967, 9932.56] + - [970, 9932.56] - - [1024, 3402, 1, 4096] - - [985, 9153.39] + - [988, 9153.39] - - [4096, 3181, 1, 1024] - - [977, 9789.05] + - [980, 9789.05] - - [4096, 3550, 1, 1024] - - [967, 9888.03] + - [970, 9888.03] - - [4096, 3445, 1, 1024] - - [977, 9752.55] + - [980, 9752.55] - - [1024, 3138, 1, 4096] - - [963, 8484.0] + - [966, 8484.0] - - [64, 99, 624, 99] - - [1013, 5323.89] + - [1016, 5323.89] - - [4096, 3079, 1, 1024] - - [963, 9562.16] + - [966, 9562.16] - - [4096, 3144, 1, 1024] - - [977, 9686.56] + - [980, 9686.56] - - [4096, 3860, 1, 1024] - - [978, 9733.32] + - [981, 9733.32] - - [1024, 3515, 1, 4096] - - [986, 9478.34] + - [989, 9478.34] - - [4096, 3408, 1, 1024] - - [963, 9764.86] + - [966, 9764.86] - - [64, 101, 624, 102] - - [1013, 5482.69] + - [1016, 5482.69] - - [1024, 3181, 1, 4096] - - [965, 8593.16] + - [968, 8593.16] - - [4096, 3298, 1, 1024] - - [967, 9867.62] + - [970, 9867.62] - - [4096, 3585, 1, 1024] - - [977, 9632.91] + - [980, 9632.91] - - [1024, 3550, 1, 4096] - - [986, 9564.36] + - [989, 9564.36] - - [1024, 4020, 1, 1024] - - [969, 9339.05] + - [972, 9339.05] - - [4096, 3481, 1, 1024] - - [967, 9713.9] + - [970, 9713.9] - - [4096, 3530, 1, 1024] - - [967, 9833.89] + - [970, 9833.89] - - [4096, 3425, 1, 1024] - - [963, 9675.56] + - [966, 9675.56] - - [4096, 4026, 1, 1024] - - [967, 9849.67] + - [970, 9849.67] - - [1024, 3860, 1, 1024] - - [980, 9073.49] + - [983, 9073.49] - - [4096, 3975, 1, 1024] - - [967, 9737.62] + - [970, 9737.62] - - [1024, 3286, 1, 4096] - - [964, 8884.14] + - [967, 8884.14] - - [1024, 3176, 1, 4096] - - [964, 8597.38] + - [967, 8597.38] - - [1024, 3894, 1, 4096] - - [968, 9359.03] + - [971, 9359.03] - - [4096, 3355, 1, 1024] - - [977, 9692.99] + - [980, 9692.99] - - [4096, 3404, 1, 1024] - - [977, 9786.02] + - [980, 9786.02] - - [1024, 3501, 1, 4096] - - [985, 9426.04] + - [988, 9426.04] - - [4096, 3245, 1, 1024] - - [967, 9723.47] + - [970, 9723.47] - - [1024, 3431, 1, 4096] - - [983, 9244.22] + - [986, 9244.22] - - [1024, 4000, 1, 1024] - - [979, 9343.93] + - [982, 9343.93] - - [4096, 3509, 1, 1024] - - [966, 9781.62] + - [969, 9781.62] - - [4096, 3558, 1, 1024] - - [967, 9905.05] + - [970, 9905.05] - - [1024, 3535, 1, 4096] - - [985, 9519.05] + - [988, 9519.05] - - [1024, 3414, 1, 4096] - - [983, 9197.95] + - [986, 9197.95] - - [1024, 3445, 1, 4096] - - [986, 9279.56] + - [989, 9279.56] - - [1024, 3436, 1, 4096] - - [986, 9259.6] + - [989, 9259.6] - - [4096, 3472, 1, 1024] - - [967, 9685.17] + - [970, 9685.17] - - [1024, 3211, 1, 4096] - - [965, 8708.31] + - [968, 8708.31] - - [64, 7, 8192, 7] - - [1002, 802.816] + - [1005, 802.816] - - [4096, 3383, 1, 1024] - - [977, 9734.72] + - [980, 9734.72] - - [4096, 3448, 1, 1024] - - [978, 9828.44] + - [981, 9828.44] - - [1024, 3343, 1, 4096] - - [979, 9010.36] + - [982, 9010.36] - - [1024, 3518, 1, 4096] - - [986, 9467.92] + - [989, 9467.92] - - [4096, 3289, 1, 1024] - - [967, 9844.06] + - [970, 9844.06] - - [1024, 3440, 1, 4096] - - [982, 9269.42] + - [985, 9269.42] - - [1024, 4032, 1, 33708] - - [966, 9822.31] + - [969, 9822.31] - - [4096, 3489, 1, 1024] - - [966, 9741.93] + - [969, 9741.93] - - [4096, 3346, 1, 1024] - - [963, 9616.64] + - [966, 9616.64] - - [1024, 3534, 1, 4096] - - [985, 9524.19] + - [988, 9524.19] - - [1024, 3079, 1, 4096] - - [980, 8397.67] + - [983, 8397.67] - - [1024, 3955, 1, 4096] - - [967, 9492.15] + - [970, 9492.15] - - [4096, 3236, 1, 1024] - - [967, 9705.93] + - [970, 9705.93] - - [1024, 3545, 1, 4096] - - [985, 9551.87] + - [988, 9551.87] - - [1024, 3144, 1, 4096] - - [979, 8556.7] + - [982, 8556.7] - - [4096, 3780, 1, 1024] - - [966, 9847.5] + - [969, 9847.5] - - [4096, 3163, 1, 1024] - - [977, 9717.69] + - [980, 9717.69] - - [4096, 3468, 1, 1024] - - [967, 9686.39] + - [970, 9686.39] - - [1024, 3539, 1, 4096] - - [986, 9526.89] + - [989, 9526.89] - - [1024, 3541, 1, 4096] - - [986, 9532.76] + - [989, 9532.76] - - [4096, 3363, 1, 1024] - - [962, 9699.0] + - [965, 9699.0] - - [1024, 3475, 1, 4096] - - [986, 9357.0] + - [989, 9357.0] - - [4096, 3110, 1, 1024] - - [978, 9659.58] + - [981, 9659.58] - - [1024, 3509, 1, 4096] - - [985, 9450.49] + - [988, 9450.49] - - [1024, 3413, 1, 4096] - - [986, 9185.81] + - [989, 9185.81] - - [1024, 3975, 1, 1024] - - [964, 9315.42] + - [967, 9315.42] - - [4096, 3549, 1, 1024] - - [967, 9884.72] + - [970, 9884.72] - - [4096, 3342, 1, 1024] - - [977, 9644.27] + - [980, 9644.27] - - [1024, 2985, 1, 4096] - - [967, 9392.07] + - [970, 9392.07] - - [1024, 3876, 1, 33708] - - [966, 9442.22] + - [969, 9442.22] - - [4096, 3280, 1, 1024] - - [966, 9819.92] + - [969, 9819.92] - - [4096, 3191, 1, 1024] - - [978, 9862.08] + - [981, 9862.08] - - [4096, 3512, 1, 1024] - - [967, 9793.11] + - [970, 9793.11] - - [1024, 3560, 1, 4096] - - [983, 9555.45] + - [986, 9555.45] - - [4096, 2499, 1, 1024] - - [967, 9669.35] + - [970, 9669.35] - - [1024, 3248, 1, 4096] - - [964, 8811.84] + - [967, 8811.84] - - [4096, 3423, 1, 1024] - - [978, 9729.67] + - [981, 9729.67] - - [64, 111, 576, 111] - - [1013, 5982.63] + - [1016, 5982.63] - - [4096, 3297, 1, 1024] - - [966, 9865.19] + - [969, 9865.19] - - [4096, 3154, 1, 1024] - - [978, 9613.42] + - [981, 9613.42] - - [1024, 3303, 1, 4096] - - [965, 8951.79] + - [968, 8951.79] - - [1024, 3222, 1, 4096] - - [985, 8682.89] + - [988, 8682.89] - - [1024, 3978, 1, 1024] - - [969, 9234.93] + - [972, 9234.93] - - [4096, 3529, 1, 1024] - - [967, 9831.62] + - [970, 9831.62] - - [4096, 3386, 1, 1024] - - [977, 9755.67] + - [980, 9755.67] - - [64, 134, 480, 134] - - [992, 5990.53] + - [995, 5990.53] - - [1024, 3451, 1, 4096] - - [983, 9277.61] + - [986, 9277.61] - - [4096, 3562, 1, 1024] - - [967, 9908.82] + - [970, 9908.82] - - [4096, 3276, 1, 1024] - - [966, 9818.04] + - [969, 9818.04] - - [64, 135, 480, 132] - - [1021, 6071.77] + - [1024, 6071.77] - - [1024, 3894, 1, 33708] - - [966, 9487.79] + - [969, 9487.79] - - [64, 134, 480, 132] - - [1020, 6091.65] + - [1023, 6091.65] - - [4096, 3540, 1, 1024] - - [967, 9862.79] + - [970, 9862.79] - - [1024, 3416, 1, 4096] - - [985, 9206.17] + - [988, 9206.17] - - [1024, 4005, 1, 33708] - - [966, 9757.19] + - [969, 9757.19] - - [1024, 3942, 1, 4096] - - [969, 9455.75] + - [972, 9455.75] - - [4096, 3403, 1, 1024] - - [977, 9739.36] + - [980, 9739.36] - - [4096, 3381, 1, 1024] - - [978, 9760.04] + - [981, 9760.04] - - [1024, 3492, 1, 4096] - - [982, 9391.69] + - [985, 9391.69] - - [4096, 3101, 1, 1024] - - [978, 9625.92] + - [981, 9625.92] - - [1024, 3430, 1, 4096] - - [986, 9232.04] + - [989, 9232.04] - - [1024, 3977, 1, 4096] - - [969, 9562.9] + - [972, 9562.9] - - [1024, 3640, 1, 4096] - - [968, 8761.4] + - [971, 8761.4] - - [4096, 3557, 1, 1024] - - [967, 9905.42] + - [970, 9905.42] - - [4096, 3414, 1, 1024] - - [963, 9755.39] + - [966, 9755.39] - - [1024, 3391, 1, 4096] - - [986, 9142.56] + - [989, 9142.56] - - [64, 134, 480, 135] - - [995, 5922.05] + - [998, 5922.05] - - [64, 16, 3840, 16] - - [1011, 2080.51] + - [1014, 2080.51] - - [1024, 3356, 1, 4096] - - [986, 9050.99] + - [989, 9050.99] - - [4096, 3320, 1, 1024] - - [967, 9929.47] + - [970, 9929.47] - - [4096, 2765, 1, 1024] - - [967, 9750.18] + - [970, 9750.18] - - [64, 162, 400, 162] - - [984, 6515.19] + - [987, 6515.19] - - [1024, 3411, 1, 4096] - - [986, 9185.62] + - [989, 9185.62] - - [1024, 3978, 1, 4096] - - [966, 9562.67] + - [969, 9562.67] - - [4096, 3487, 1, 1024] - - [967, 9733.75] + - [970, 9733.75] - - [4096, 3520, 1, 1024] - - [966, 9813.85] + - [969, 9813.85] - - [4096, 3942, 1, 1024] - - [977, 9804.29] + - [980, 9804.29] - - [4096, 3431, 1, 1024] - - [962, 9818.96] + - [965, 9818.96] - - [1024, 3271, 1, 4096] - - [979, 8912.98] + - [982, 8912.98] - - [4096, 4020, 1, 1024] - - [966, 9831.32] + - [969, 9831.32] - - [1024, 3481, 1, 4096] - - [982, 9376.05] + - [985, 9376.05] - - [1024, 3419, 1, 4096] - - [985, 9208.58] + - [988, 9208.58] - - [1024, 4059, 1, 4096] - - [969, 9733.73] + - [972, 9733.73] - - [4096, 3345, 1, 1024] - - [978, 9651.33] + - [981, 9651.33] - - [4096, 3394, 1, 1024] - - [978, 9780.33] + - [981, 9780.33] - - [1024, 3298, 1, 4096] - - [985, 8889.53] + - [988, 8889.53] - - [4096, 3235, 1, 1024] - - [967, 9705.71] + - [970, 9705.71] - - [1024, 3681, 1, 33708] - - [974, 9146.12] + - [977, 9146.12] - - [1024, 3840, 1, 4096] - - [967, 9253.85] + - [970, 9253.85] - - [1024, 3362, 1, 4096] - - [986, 9059.71] + - [989, 9059.71] - - [4096, 3467, 1, 1024] - - [966, 9677.41] + - [969, 9677.41] - - [1024, 3349, 1, 4096] - - [986, 9033.97] + - [989, 9033.97] - - [1024, 3460, 1, 4096] - - [986, 9322.84] + - [989, 9322.84] - - [4096, 3214, 1, 1024] - - [967, 9644.36] + - [970, 9644.36] - - [1024, 3398, 1, 4096] - - [986, 9157.19] + - [989, 9157.19] - - [4096, 3478, 1, 1024] - - [966, 9706.56] + - [969, 9706.56] - - [1024, 4050, 1, 33708] - - [966, 9865.04] + - [969, 9865.04] - - [1024, 3244, 1, 4096] - - [982, 8744.43] + - [985, 8744.43] - - [4096, 3341, 1, 1024] - - [978, 9646.69] + - [981, 9646.69] - - [4096, 3454, 1, 1024] - - [963, 9880.46] + - [966, 9880.46] - - [1024, 3166, 1, 4096] - - [980, 8618.36] + - [983, 8618.36] - - [1024, 3425, 1, 4096] - - [986, 9225.22] + - [989, 9225.22] - - [4096, 3295, 1, 1024] - - [967, 9863.71] + - [970, 9863.71] - - [4096, 3072, 1, 1024] - - [966, 9970.99] + - [969, 9970.99] - - [4096, 3822, 1, 1024] - - [967, 9951.97] + - [970, 9951.97] - - [1024, 3681, 1, 4096] - - [968, 8856.84] + - [971, 8856.84] - - [1024, 4050, 1, 4096] - - [968, 9717.48] + - [971, 9717.48] - - [4096, 3495, 1, 1024] - - [966, 9741.04] + - [969, 9741.04] - - [4096, 3560, 1, 1024] - - [967, 9909.04] + - [970, 9909.04] - - [1024, 3524, 1, 4096] - - [985, 9503.1] + - [988, 9503.1] - - [1024, 3942, 1, 33708] - - [966, 9602.57] + - [969, 9602.57] - - [1024, 3304, 1, 4096] - - [965, 8928.66] + - [968, 8928.66] - - [1024, 3387, 1, 4096] - - [986, 9127.55] + - [989, 9127.55] - - [1024, 3498, 1, 4096] - - [985, 9423.29] + - [988, 9423.29] - - [4096, 3458, 1, 1024] - - [966, 9642.53] + - [969, 9642.53] - - [4096, 2967, 1, 1024] - - [966, 9626.61] + - [969, 9626.61] - - [64, 8, 7280, 8] - - [988, 1032.51] + - [991, 1032.51] - - [4096, 3385, 1, 1024] - - [962, 9735.67] + - [965, 9735.67] - - [4096, 3434, 1, 1024] - - [977, 9808.8] + - [980, 9808.8] - - [1024, 3519, 1, 4096] - - [986, 9484.73] + - [989, 9484.73] - - [1024, 3511, 1, 4096] - - [986, 9456.37] + - [989, 9456.37] - - [1024, 3288, 1, 4096] - - [985, 8863.95] + - [988, 8863.95] - - [1024, 2918, 1, 4096] - - [968, 9170.25] + - [971, 9170.25] - - [4096, 3573, 1, 1024] - - [967, 9945.75] + - [970, 9945.75] - - [1024, 3822, 1, 33708] - - [976, 9330.9] + - [979, 9330.9] - - [64, 102, 624, 102] - - [1013, 5531.07] + - [1016, 5531.07] - - [4096, 3539, 1, 1024] - - [967, 9855.29] + - [970, 9855.29] - - [4096, 3332, 1, 1024] - - [978, 9648.87] + - [981, 9648.87] - - [4096, 3286, 1, 1024] - - [967, 9846.32] + - [970, 9846.32] - - [1024, 4026, 1, 4096] - - [968, 9675.84] + - [971, 9675.84] - - [1024, 3277, 1, 4096] - - [982, 8836.11] + - [985, 8836.11] - - [1024, 3471, 1, 4096] - - [986, 9346.23] + - [989, 9346.23] - - [4096, 3518, 1, 1024] - - [967, 9804.1] + - [970, 9804.1] - - [1024, 3393, 1, 4096] - - [986, 9148.89] + - [989, 9148.89] - - [4096, 3413, 1, 1024] - - [963, 9785.07] + - [966, 9785.07] - - [4096, 3303, 1, 1024] - - [967, 9884.27] + - [970, 9884.27] - - [1024, 3207, 1, 4096] - - [964, 8714.59] + - [967, 8714.59] - - [1024, 3894, 1, 1024] - - [980, 9181.41] + - [983, 9181.41] - - [1024, 3977, 1, 1024] - - [980, 9240.8] + - [983, 9240.8] - - [64, 135, 480, 133] - - [995, 5923.3] + - [998, 5923.3] - - [4096, 3535, 1, 1024] - - [967, 9839.45] + - [970, 9839.45] - - [4096, 3376, 1, 1024] - - [962, 9711.92] + - [965, 9711.92] - - [1024, 3355, 1, 4096] - - [986, 9043.17] + - [989, 9043.17] - - [64, 27, 2336, 27] - - [1014, 2929.8] + - [1017, 2929.8] - - [1024, 3466, 1, 4096] - - [986, 9339.0] + - [989, 9339.0] - - [4096, 3266, 1, 1024] - - [967, 9789.19] + - [970, 9789.19] - - [1024, 3404, 1, 4096] - - [986, 9176.66] + - [989, 9176.66] - - [1024, 3999, 1, 1024] - - [979, 9391.81] + - [982, 9391.81] - - [64, 148, 432, 143] - - [992, 6182.82] + - [995, 6182.82] - - [4096, 3498, 1, 1024] - - [966, 9764.46] + - [969, 9764.46] - - [1024, 4032, 1, 1024] - - [964, 9401.93] + - [967, 9401.93] - - [1024, 3410, 1, 4096] - - [985, 9183.4] + - [988, 9183.4] - - [4096, 3393, 1, 1024] - - [978, 9695.39] + - [981, 9695.39] - - [1024, 3140, 1, 4096] - - [979, 8504.76] + - [982, 8504.76] - - [1024, 3910, 1, 33708] - - [966, 9525.96] + - [969, 9525.96] - - [1024, 3334, 1, 4096] - - [985, 8987.49] + - [988, 8987.49] - - [4096, 3140, 1, 1024] - - [978, 9660.61] + - [981, 9660.61] - - [1024, 4005, 1, 4096] - - [969, 9629.78] + - [972, 9629.78] - - [1024, 3579, 1, 4096] - - [985, 9661.35] + - [988, 9661.35] - - [4096, 3372, 1, 1024] - - [978, 9697.22] + - [981, 9697.22] - - [1024, 3245, 1, 4096] - - [979, 8847.66] + - [982, 8847.66] - - [64, 38, 1680, 38] - - [989, 3340.34] + - [992, 3340.34] - - [4096, 3956, 1, 1024] - - [978, 9911.05] + - [981, 9911.05] - - [4096, 3213, 1, 1024] - - [966, 9643.01] + - [969, 9643.01] - - [1024, 3361, 1, 4096] - - [986, 9062.14] + - [989, 9062.14] - - [1024, 3536, 1, 4096] - - [985, 9530.55] + - [988, 9530.55] - - [1024, 3968, 1, 1024] - - [980, 9377.82] + - [983, 9377.82] - - [4096, 3477, 1, 1024] - - [967, 9700.67] + - [970, 9700.67] - - [4096, 3526, 1, 1024] - - [967, 9824.31] + - [970, 9824.31] - - [1024, 4005, 1, 1024] - - [964, 9362.29] + - [967, 9362.29] - - [1024, 3530, 1, 4096] - - [983, 9487.07] + - [986, 9487.07] - - [1024, 3944, 1, 4096] - - [968, 9464.45] + - [971, 9464.45] - - [4096, 3453, 1, 1024] - - [977, 9826.67] + - [980, 9826.67] - - [4096, 3184, 1, 1024] - - [978, 9833.49] + - [981, 9833.49] - - [4096, 3579, 1, 1024] - - [967, 9962.45] + - [970, 9962.45] - - [4096, 3351, 1, 1024] - - [978, 9653.24] + - [981, 9653.24] - - [4096, 3416, 1, 1024] - - [962, 9810.3] + - [965, 9810.3] - - [64, 100, 624, 100] - - [1013, 5408.45] + - [1016, 5408.45] - - [1024, 3822, 1, 4096] - - [968, 9196.1] + - [971, 9196.1] - - [1024, 3796, 1, 4096] - - [968, 9131.86] + - [971, 9131.86] - - [4096, 3257, 1, 1024] - - [966, 9767.24] + - [969, 9767.24] - - [4096, 3306, 1, 1024] - - [966, 9893.25] + - [969, 9893.25] - - [1024, 3505, 1, 4096] - - [986, 9449.92] + - [989, 9449.92] - - [1024, 3315, 1, 4096] - - [979, 8979.67] + - [982, 8979.67] - - [1024, 3486, 1, 4096] - - [985, 9393.38] + - [988, 9393.38] - - [4096, 3457, 1, 1024] - - [966, 9653.09] + - [969, 9653.09] - - [4096, 3870, 1, 1024] - - [963, 9717.41] + - [966, 9717.41] - - [1024, 3447, 1, 4096] - - [986, 9273.04] + - [989, 9273.04] - - [1024, 3558, 1, 4096] - - [983, 9567.23] + - [986, 9567.23] - - [4096, 3433, 1, 1024] - - [963, 9759.16] + - [966, 9759.16] - - [4096, 3180, 1, 1024] - - [978, 9738.53] + - [981, 9738.53] - - [1024, 3213, 1, 4096] - - [964, 8692.15] + - [967, 8692.15] - - [1024, 3900, 1, 4096] - - [968, 9388.51] + - [971, 9388.51] - - [4096, 3444, 1, 1024] - - [977, 9869.63] + - [980, 9869.63] - - [1024, 3504, 1, 4096] - - [986, 9429.28] + - [989, 9429.28] - - [4096, 4059, 1, 1024] - - [967, 9920.69] + - [970, 9920.69] - - [1024, 3442, 1, 4096] - - [986, 9272.91] + - [989, 9272.91] - - [4096, 3517, 1, 1024] - - [966, 9808.09] + - [969, 9808.09] - - [1024, 3566, 1, 4096] - - [985, 9622.79] + - [988, 9622.79] - - [4096, 3248, 1, 1024] - - [966, 9730.23] + - [969, 9730.23] - - [1024, 3547, 1, 4096] - - [985, 9564.63] + - [988, 9564.63] - - [64, 59, 1088, 59] - - [1004, 4611.66] + - [1007, 4611.66] - - [1024, 3340, 1, 4096] - - [985, 8992.11] + - [988, 8992.11] - - [4096, 3480, 1, 1024] - - [967, 9710.07] + - [970, 9710.07] - - [1024, 3968, 1, 4096] - - [967, 9543.01] + - [970, 9543.01] - - [4096, 3424, 1, 1024] - - [963, 9808.56] + - [966, 9808.56] - - [1024, 3906, 1, 1024] - - [965, 9150.44] + - [968, 9150.44] - - [4096, 3265, 1, 1024] - - [966, 9786.75] + - [969, 9786.75] - - [1024, 3384, 1, 4096] - - [986, 9119.46] + - [989, 9119.46] - - [1024, 3494, 1, 4096] - - [983, 9415.42] + - [986, 9415.42] - - [1024, 3236, 1, 4096] - - [980, 8767.04] + - [983, 8767.04] - - [4096, 3497, 1, 1024] - - [967, 9750.76] + - [970, 9750.76] - - [4096, 3354, 1, 1024] - - [978, 9665.07] + - [981, 9665.07] - - [4096, 3055, 1, 1024] - - [967, 9883.99] + - [970, 9883.99] - - [64, 11, 5456, 11] - - [990, 1368.24] + - [993, 1368.24] - - [4096, 3244, 1, 1024] - - [966, 9719.92] + - [969, 9719.92] - - [4096, 3139, 1, 1024] - - [977, 9736.96] + - [980, 9736.96] - - [4096, 3508, 1, 1024] - - [966, 9771.56] + - [969, 9771.56] - - [4096, 4050, 1, 1024] - - [966, 9898.69] + - [969, 9898.69] - - [1024, 3472, 1, 4096] - - [985, 9353.73] + - [988, 9353.73] - - [1024, 3861, 1, 1024] - - [964, 9061.22] + - [967, 9061.22] - - [1024, 3910, 1, 1024] - - [968, 9043.44] + - [971, 9043.44] - - [4096, 3371, 1, 1024] - - [978, 9738.14] + - [981, 9738.14] - - [64, 65, 992, 65] - - [1017, 4354.49] + - [1020, 4354.49] - - [1024, 3751, 1, 4096] - - [967, 9018.64] + - [970, 9018.64] - - [4096, 3325, 1, 1024] - - [966, 9958.63] + - [969, 9958.63] - - [1024, 3321, 1, 4096] - - [986, 8952.45] + - [989, 8952.45] - - [1024, 3944, 1, 1024] - - [965, 9117.25] + - [968, 9117.25] - - [4096, 3525, 1, 1024] - - [967, 9822.04] + - [970, 9822.04] - - [4096, 3382, 1, 1024] - - [978, 9720.11] + - [981, 9720.11] - - [64, 122, 528, 122] - - [1013, 6389.23] + - [1016, 6389.23] - - [1024, 3453, 1, 4096] - - [983, 9304.93] + - [986, 9304.93] - - [4096, 3564, 1, 1024] - - [966, 9911.22] + - [969, 9911.22] - - [4096, 3288, 1, 1024] - - [966, 9841.07] + - [969, 9841.07] - - [1024, 3925, 1, 4096] - - [967, 9418.85] + - [970, 9418.85] - - [1024, 3057, 1, 4096] - - [968, 9590.41] + - [971, 9590.41] - - [4096, 3488, 1, 1024] - - [967, 9732.4] + - [970, 9732.4] - - [4096, 3046, 1, 1024] - - [967, 9850.62] + - [970, 9850.62] - - [1024, 3189, 1, 4096] - - [979, 8676.92] + - [982, 8676.92] - - [4096, 3399, 1, 1024] - - [963, 9672.99] + - [966, 9672.99] - - [1024, 3383, 1, 4096] - - [986, 9102.27] + - [989, 9102.27] - - [1024, 3415, 1, 4096] - - [986, 9216.27] + - [989, 9216.27] - - [1024, 3388, 1, 4096] - - [986, 9127.43] + - [989, 9127.43] - - [1024, 3376, 1, 4096] - - [983, 9090.43] + - [986, 9090.43] - - [1024, 3473, 1, 4096] - - [986, 9354.02] + - [989, 9354.02] - - [4096, 3162, 1, 1024] - - [962, 9694.73] + - [965, 9694.73] - - [1024, 3448, 1, 4096] - - [986, 9283.35] + - [989, 9283.35] - - [4096, 3362, 1, 1024] - - [978, 9673.23] + - [981, 9673.23] - - [64, 228, 272, 228] - - [971, 7039.03] + - [974, 7039.03] - - [1024, 3262, 1, 4096] - - [980, 8850.74] + - [983, 8850.74] - - [1024, 3184, 1, 4096] - - [965, 8625.27] + - [968, 8625.27] - - [1024, 3378, 1, 4096] - - [985, 9105.17] + - [988, 9105.17] - - [4096, 3548, 1, 1024] - - [966, 9877.73] + - [969, 9877.73] - - [4096, 2977, 1, 1024] - - [966, 9647.71] + - [969, 9647.71] - - [64, 21, 2976, 21] - - [1001, 2364.71] + - [1004, 2364.71] - - [64, 112, 576, 111] - - [1000, 5973.58] + - [1003, 5973.58] - - [4096, 3443, 1, 1024] - - [962, 9784.4] + - [965, 9784.4] - - [1024, 3289, 1, 4096] - - [986, 8873.94] + - [989, 8873.94] - - [1024, 3483, 1, 4096] - - [982, 9380.47] + - [985, 9380.47] - - [4096, 3190, 1, 1024] - - [978, 9850.86] + - [981, 9850.86] - - [1024, 3421, 1, 4096] - - [986, 9213.96] + - [989, 9213.96] - - [1024, 3514, 1, 4096] - - [985, 9458.13] + - [988, 9458.13] - - [1024, 3532, 1, 4096] - - [986, 9512.93] + - [989, 9512.93] - - [1024, 3565, 1, 4096] - - [985, 9630.5] + - [988, 9630.5] - - [4096, 3422, 1, 1024] - - [963, 9733.69] + - [966, 9733.69] - - [4096, 3263, 1, 1024] - - [967, 9776.84] + - [970, 9776.84] - - [4096, 3296, 1, 1024] - - [967, 9860.51] + - [970, 9860.51] - - [4096, 3640, 1, 1024] - - [977, 9782.2] + - [980, 9782.2] - - [4096, 3463, 1, 1024] - - [966, 9671.9] + - [969, 9671.9] - - [4096, 3528, 1, 1024] - - [967, 9829.88] + - [970, 9829.88] - - [1024, 3351, 1, 4096] - - [980, 9054.27] + - [983, 9054.27] - - [1024, 3462, 1, 4096] - - [986, 9327.75] + - [989, 9327.75] - - [4096, 3226, 1, 1024] - - [967, 9674.83] + - [970, 9674.83] - - [4096, 3439, 1, 1024] - - [962, 9823.08] + - [965, 9823.08] - - [4096, 3121, 1, 1024] - - [962, 9672.54] + - [965, 9672.54] - - [1024, 4059, 1, 33708] - - [966, 9885.62] + - [969, 9885.62] - - [1024, 3311, 1, 4096] - - [986, 8909.91] + - [989, 8909.91] - - [1024, 3230, 1, 4096] - - [986, 8705.8] + - [989, 8705.8] - - [4096, 3353, 1, 1024] - - [978, 9671.76] + - [981, 9671.76] - - [4096, 3402, 1, 1024] - - [963, 9726.94] + - [966, 9726.94] - - [1024, 3427, 1, 4096] - - [986, 9233.45] + - [989, 9233.45] - - [1024, 3346, 1, 4096] - - [986, 9015.67] + - [989, 9015.67] - - [1024, 3126, 1, 4096] - - [980, 8519.21] + - [983, 8519.21] - - [1024, 3796, 1, 1024] - - [964, 8916.65] + - [967, 8916.65] - - [1024, 3990, 1, 4096] - - [968, 9600.76] + - [971, 9600.76] - - [1024, 3257, 1, 4096] - - [964, 8790.32] + - [967, 8790.32] - - [4096, 3996, 1, 1024] - - [967, 9788.15] + - [970, 9788.15] - - [64, 143, 432, 143] - - [995, 6087.14] + - [998, 6087.14] - - [1024, 3306, 1, 4096] - - [979, 9035.59] + - [982, 9035.59] - - [1024, 3389, 1, 4096] - - [986, 9134.82] + - [989, 9134.82] - - [1024, 3500, 1, 4096] - - [986, 9443.23] + - [989, 9443.23] - - [1024, 3999, 1, 33708] - - [967, 9741.14] + - [970, 9741.14] - - [4096, 3486, 1, 1024] - - [967, 9719.57] + - [970, 9719.57] - - [1024, 3438, 1, 4096] - - [986, 9259.28] + - [989, 9259.28] - - [4096, 3616, 1, 1024] - - [977, 9739.67] + - [980, 9739.67] - - [1024, 3955, 1, 1024] - - [979, 9260.27] + - [982, 9260.27] - - [4096, 3430, 1, 1024] - - [978, 9819.85] + - [981, 9819.85] - - [4096, 3271, 1, 1024] - - [967, 9801.94] + - [970, 9801.94] - - [1024, 3364, 1, 4096] - - [979, 9144.53] + - [982, 9144.53] - - [64, 54, 1184, 54] - - [999, 4315.68] + - [1002, 4315.68] - - [1024, 3497, 1, 4096] - - [986, 9429.32] + - [989, 9429.32] - - [4096, 3503, 1, 1024] - - [966, 9764.38] + - [969, 9764.38] - - [4096, 3344, 1, 1024] - - [963, 9614.06] + - [966, 9614.06] - - [1024, 3457, 1, 4096] - - [986, 9320.5] + - [989, 9320.5] - - [4096, 3466, 1, 1024] - - [966, 9677.71] + - [969, 9677.71] - - [1024, 3976, 1, 33708] - - [967, 9685.28] + - [970, 9685.28] - - [1024, 3395, 1, 4096] - - [985, 9146.29] + - [988, 9146.29] - - [4096, 3361, 1, 1024] - - [977, 9677.79] + - [980, 9677.79] - - [1024, 3751, 1, 33708] - - [975, 9234.59] + - [978, 9234.59] - - [1024, 3822, 1, 1024] - - [964, 8977.73] + - [967, 8977.73] - - [4096, 3315, 1, 1024] - - [967, 9922.44] + - [970, 9922.44] - - [1024, 3163, 1, 4096] - - [979, 8577.69] + - [982, 8577.69] - - [4096, 3547, 1, 1024] - - [967, 9882.82] + - [970, 9882.82] - - [4096, 3340, 1, 1024] - - [977, 9635.32] + - [980, 9635.32] - - [1024, 3296, 1, 4096] - - [986, 8874.56] + - [989, 8874.56] - - [1024, 3468, 1, 4096] - - [986, 9350.16] + - [989, 9350.16] - - [4096, 3294, 1, 1024] - - [966, 9856.77] + - [969, 9856.77] - - [1024, 3406, 1, 4096] - - [982, 9162.74] + - [985, 9162.74] - - [1024, 3860, 1, 33708] - - [966, 9403.46] + - [969, 9403.46] - - [1024, 3584, 1, 4096] - - [983, 9677.34] + - [986, 9677.34] - - [4096, 3189, 1, 1024] - - [978, 9820.59] + - [981, 9820.59] - - [4096, 3494, 1, 1024] - - [966, 9747.58] + - [969, 9747.58] - - [64, 135, 480, 135] - - [992, 5966.24] + - [995, 5966.24] - - [1024, 3093, 1, 4096] - - [980, 8445.96] + - [983, 8445.96] - - [4096, 3421, 1, 1024] - - [963, 9775.93] + - [966, 9775.93] - - [1024, 3479, 1, 4096] - - [986, 9376.44] + - [989, 9376.44] - - [1024, 3433, 1, 4096] - - [986, 9251.04] + - [989, 9251.04] - - [4096, 3311, 1, 1024] - - [966, 9901.43] + - [969, 9901.43] - - [1024, 3381, 1, 4096] - - [986, 9103.89] + - [989, 9103.89] - - [1024, 3996, 1, 4096] - - [967, 9609.46] + - [970, 9609.46] - - [4096, 3384, 1, 1024] - - [977, 9749.91] + - [980, 9749.91] - - [1024, 3247, 1, 4096] - - [965, 8872.49] + - [968, 8872.49] - - [1024, 3169, 1, 4096] - - [964, 8597.51] + - [967, 8597.51] - - [1024, 3088, 1, 4096] - - [980, 8409.97] + - [983, 8409.97] - - [1024, 3363, 1, 4096] - - [986, 9069.4] + - [989, 9069.4] - - [1024, 3538, 1, 4096] - - [985, 9529.58] + - [988, 9529.58] - - [1024, 3996, 1, 1024] - - [969, 9322.96] + - [972, 9322.96] - - [4096, 3169, 1, 1024] - - [963, 9821.3] + - [966, 9821.3] - - [4096, 3538, 1, 1024] - - [966, 9859.32] + - [969, 9859.32] - - [4096, 3401, 1, 1024] - - [963, 9754.4] + - [966, 9754.4] - - [4096, 3581, 1, 1024] - - [966, 9960.61] + - [969, 9960.61] - - [1024, 3180, 1, 4096] - - [964, 8634.95] + - [967, 8634.95] - - [1024, 3870, 1, 1024] - - [965, 9085.59] + - [968, 9085.59] - - [4096, 3555, 1, 1024] - - [966, 9905.64] + - [969, 9905.64] - - [4096, 3412, 1, 1024] - - [978, 9778.46] + - [981, 9778.46] - - [4096, 3302, 1, 1024] - - [966, 9888.61] + - [969, 9888.61] - - [1024, 3561, 1, 4096] - - [982, 9596.95] + - [985, 9596.95] - - [1024, 3302, 1, 4096] - - [986, 8900.77] + - [989, 8900.77] - - [1024, 3976, 1, 4096] - - [968, 9563.12] + - [971, 9563.12] - - [4096, 3485, 1, 1024] - - [966, 9722.47] + - [969, 9722.47] - - [4096, 3534, 1, 1024] - - [966, 9847.12] + - [969, 9847.12] - - [1024, 3110, 1, 4096] - - [979, 8458.46] + - [982, 8458.46] - - [1024, 3401, 1, 4096] - - [986, 9174.71] + - [989, 9174.71] - - [4096, 3216, 1, 1024] - - [966, 9645.39] + - [969, 9645.39] - - [1024, 4020, 1, 33708] - - [966, 9793.51] + - [969, 9793.51] - - [1024, 3215, 1, 4096] - - [986, 8677.41] + - [989, 8677.41] - - [4096, 3566, 1, 1024] - - [966, 9924.68] + - [969, 9924.68] - - [1024, 3137, 1, 4096] - - [964, 8546.97] + - [967, 8546.97] - - [4096, 3359, 1, 1024] - - [963, 9673.63] + - [966, 9673.63] - - [4096, 3392, 1, 1024] - - [978, 9757.41] + - [981, 9757.41] - - [1024, 3506, 1, 4096] - - [986, 9442.9] + - [989, 9442.9] - - [4096, 3233, 1, 1024] - - [966, 9698.6] + - [969, 9698.6] - - [1024, 3444, 1, 4096] - - [986, 9275.44] + - [989, 9275.44] - - [1024, 3975, 1, 4096] - - [967, 9556.77] + - [970, 9556.77] - - [1024, 3870, 1, 33708] - - [966, 9427.34] + - [969, 9427.34] - - [4096, 3465, 1, 1024] - - [967, 9674.91] + - [970, 9674.91] - - [4096, 3968, 1, 1024] - - [963, 9927.83] + - [966, 9927.83] - - [1024, 3523, 1, 4096] - - [986, 9494.05] + - [989, 9494.05] - - [64, 10, 5952, 10] - - [990, 1224.06] + - [993, 1224.06] - - [4096, 3990, 1, 1024] - - [966, 9771.17] + - [969, 9771.17] - - [1024, 3549, 1, 4096] - - [985, 9553.32] + - [988, 9553.32] - - [1024, 3342, 1, 4096] - - [986, 9007.21] + - [989, 9007.21] - - [4096, 3476, 1, 1024] - - [966, 9703.56] + - [969, 9703.56] - - [64, 232, 272, 228] - - [972, 7078.83] + - [975, 7078.83] - - [1024, 3418, 1, 4096] - - [986, 9212.99] + - [989, 9212.99] - - [1024, 3859, 1, 1024] - - [965, 9087.44] + - [968, 9087.44] - - [4096, 3339, 1, 1024] - - [978, 9593.9] + - [981, 9593.9] - - [4096, 3452, 1, 1024] - - [963, 9872.59] + - [966, 9872.59] - - [4096, 3293, 1, 1024] - - [966, 9842.55] + - [969, 9842.55] - - [4096, 3840, 1, 1024] - - [967, 10030.7] + - [970, 10030.7] - - [1024, 3369, 1, 4096] - - [964, 9099.62] + - [967, 9099.62] - - [64, 193, 320, 193] - - [994, 6425.7] + - [997, 6425.7] - - [1024, 3544, 1, 4096] - - [983, 9556.54] + - [986, 9556.54] - - [4096, 3493, 1, 1024] - - [967, 9743.24] + - [970, 9743.24] - - [4096, 3350, 1, 1024] - - [978, 9653.01] + - [981, 9653.01] - - [64, 71, 896, 71] - - [1018, 4686.63] + - [1021, 4686.63] - - [4096, 3256, 1, 1024] - - [966, 9763.68] + - [969, 9763.68] - - [1024, 3870, 1, 4096] - - [968, 9305.18] + - [971, 9305.18] - - [4096, 4012, 1, 1024] - - [967, 9817.25] + - [970, 9817.25] - - [1024, 3280, 1, 4096] - - [986, 8841.92] + - [989, 8841.92] - - [4096, 3456, 1, 1024] - - [962, 9874.33] + - [965, 9874.33] - - [1024, 3555, 1, 4096] - - [985, 9599.53] + - [988, 9599.53] - - [4096, 3014, 1, 1024] - - [966, 9762.18] + - [969, 9762.18] - - [1024, 3474, 1, 4096] - - [986, 9373.57] + - [989, 9373.57] - - [4096, 3367, 1, 1024] - - [962, 9694.54] + - [965, 9694.54] - - [4096, 3432, 1, 1024] - - [978, 9855.17] + - [981, 9855.17] - - [64, 84, 752, 84] - - [1005, 5247.08] + - [1008, 5247.08] - - [4096, 3273, 1, 1024] - - [967, 9801.77] + - [970, 9801.77] - - [4096, 3130, 1, 1024] - - [963, 9672.42] + - [966, 9672.42] - - [1024, 2984, 1, 4096] - - [968, 9403.6] + - [971, 9403.6] - - [1024, 3995, 1, 1024] - - [980, 9392.51] + - [983, 9392.51] - - [1024, 3517, 1, 4096] - - [986, 9481.29] + - [989, 9481.29] - - [1024, 3455, 1, 4096] - - [986, 9302.19] + - [989, 9302.19] - - [1024, 3939, 1, 4096] - - [968, 9469.79] + - [971, 9469.79] - - [64, 49, 1296, 49] - - [998, 3938.86] + - [1001, 3938.86] - - [64, 14, 4368, 14] - - [990, 1802.37] + - [993, 1802.37] - - [64, 25, 2512, 25] - - [1009, 2760.44] + - [1012, 2760.44] - - [4096, 3147, 1, 1024] - - [978, 9712.93] + - [981, 9712.93] - - [4096, 3516, 1, 1024] - - [966, 9805.83] + - [969, 9805.83] - - [1024, 3876, 1, 4096] - - [968, 9320.46] + - [971, 9320.46] - - [1024, 3191, 1, 4096] - - [965, 8640.66] + - [968, 8640.66] - - [4096, 3411, 1, 1024] - - [977, 9737.27] + - [980, 9737.27] - - [1024, 3337, 1, 4096] - - [986, 8990.03] + - [989, 8990.03] - - [1024, 3512, 1, 4096] - - [986, 9459.55] + - [989, 9459.55] - - [4096, 3301, 1, 1024] - - [966, 9877.16] + - [969, 9877.16] - - [1024, 3450, 1, 4096] - - [985, 9283.01] + - [988, 9283.01] - - [4096, 3533, 1, 1024] - - [966, 9848.52] + - [969, 9848.52] - - [4096, 3390, 1, 1024] - - [978, 9764.51] + - [981, 9764.51] - - [4096, 3231, 1, 1024] - - [966, 9693.71] + - [969, 9693.71] - - [1024, 2499, 1, 4096] - - [985, 9304.71] + - [988, 9304.71] - - [1024, 3186, 1, 4096] - - [965, 8649.45] + - [968, 8649.45] - - [1024, 3380, 1, 4096] - - [986, 9101.67] + - [989, 9101.67] - - [4096, 3496, 1, 1024] - - [967, 9754.2] + - [970, 9754.2] - - [1024, 3956, 1, 33708] - - [966, 9636.67] + - [969, 9636.67] - - [1024, 3976, 1, 1024] - - [968, 9248.31] + - [971, 9248.31] - - [4096, 2736, 1, 1024] - - [966, 9651.81] + - [969, 9651.81] - - [1024, 3291, 1, 4096] - - [986, 8868.84] + - [989, 8868.84] - - [1024, 3944, 1, 33708] - - [967, 9606.9] + - [970, 9606.9] - - [1024, 3485, 1, 4096] - - [985, 9385.86] + - [988, 9385.86] - - [4096, 3138, 1, 1024] - - [963, 9672.05] + - [966, 9672.05] - - [1024, 3423, 1, 4096] - - [986, 9222.67] + - [989, 9222.67] - - [1024, 3491, 1, 4096] - - [986, 9404.92] + - [989, 9404.92] - - [1024, 3860, 1, 4096] - - [969, 9282.84] + - [972, 9282.84] - - [4096, 3211, 1, 1024] - - [966, 9640.32] + - [969, 9640.32] - - [1024, 3221, 1, 4096] - - [980, 8709.3] + - [983, 8709.3] - - [1024, 2917, 1, 4096] - - [968, 9177.01] + - [971, 9177.01] - - [4096, 3475, 1, 1024] - - [966, 9703.35] + - [969, 9703.35] - - [4096, 3524, 1, 1024] - - [966, 9816.13] + - [969, 9816.13] - - [4096, 2985, 1, 1024] - - [967, 9686.81] + - [970, 9686.81] - - [1024, 3480, 1, 4096] - - [986, 9380.1] + - [989, 9380.1] - - [4096, 3222, 1, 1024] - - [966, 9666.7] + - [969, 9666.7] - - [4096, 3451, 1, 1024] - - [962, 9877.81] + - [965, 9877.81] - - [1024, 3969, 1, 33708] - - [966, 9669.54] + - [969, 9669.54] - - [1024, 3640, 1, 1024] - - [973, 8565.58] + - [976, 8565.58] - - [1024, 3297, 1, 4096] - - [982, 8889.12] + - [985, 8889.12] - - [4096, 3944, 1, 1024] - - [963, 9902.75] + - [966, 9902.75] - - [1024, 3216, 1, 4096] - - [965, 8695.78] + - [968, 8695.78] - - [1024, 3840, 1, 1024] - - [979, 9045.95] + - [982, 9045.95] - - [4096, 3349, 1, 1024] - - [977, 9676.72] + - [980, 9676.72] - - [4096, 3398, 1, 1024] - - [963, 9775.74] + - [966, 9775.74] - - [1024, 3154, 1, 4096] - - [980, 8662.16] + - [983, 8662.16] - - [1024, 3978, 1, 33708] - - [967, 9689.06] + - [970, 9689.06] - - [1024, 3348, 1, 4096] - - [986, 9014.57] + - [989, 9014.57] - - [4096, 3304, 1, 1024] - - [967, 9886.7] + - [970, 9886.7] - - [4096, 4030, 1, 1024] - - [967, 9859.0] + - [970, 9859.0] - - [1024, 4026, 1, 1024] - - [964, 9326.54] + - [967, 9326.54] - - [4096, 3471, 1, 1024] - - [966, 9682.9] + - [969, 9682.9] - - [1024, 3259, 1, 4096] - - [980, 8792.09] + - [983, 8792.09] - - [64, 132, 480, 132] - - [1020, 6027.76] + - [1023, 6027.76] - - [1024, 3308, 1, 4096] - - [985, 8905.04] + - [988, 8905.04] - - [4096, 3391, 1, 1024] - - [978, 9765.25] + - [981, 9765.25] - - [1024, 3312, 1, 4096] - - [986, 8917.64] + - [989, 8917.64] - - [1024, 3502, 1, 4096] - - [986, 9435.52] + - [989, 9435.52] - - [1024, 3968, 1, 33708] - - [966, 9668.14] + - [969, 9668.14] - - [1024, 3424, 1, 4096] - - [982, 9215.89] + - [985, 9215.89] - - [64, 13, 4672, 13] - - [991, 1662.25] + - [994, 1662.25] - - [4096, 4032, 1, 1024] - - [977, 9877.72] + - [980, 9877.72] - - [1024, 3900, 1, 1024] - - [980, 9116.83] + - [983, 9116.83] - - [4096, 3442, 1, 1024] - - [977, 9773.08] + - [980, 9773.08] - - [1024, 3366, 1, 4096] - - [986, 9079.36] + - [989, 9079.36] - - [4096, 3999, 1, 1024] - - [966, 9786.36] + - [969, 9786.36] - - [1024, 3477, 1, 4096] - - [986, 9364.79] + - [989, 9364.79] - - [1024, 2505, 1, 4096] - - [986, 9303.93] + - [989, 9303.93] - - [4096, 3515, 1, 1024] - - [966, 9797.83] + - [969, 9797.83] - - [1024, 3564, 1, 4096] - - [982, 9632.76] + - [985, 9632.76] - - [4096, 3057, 1, 1024] - - [967, 9880.09] + - [970, 9880.09] - - [1024, 3339, 1, 4096] - - [965, 9029.76] + - [968, 9029.76] - - [4096, 3262, 1, 1024] - - [966, 9780.0] + - [969, 9780.0] - - [1024, 4030, 1, 4096] - - [969, 9681.9] + - [972, 9681.9] - - [1024, 3265, 1, 4096] - - [986, 8797.42] + - [989, 8797.42] - - [1024, 3459, 1, 4096] - - [986, 9312.96] + - [989, 9312.96] - - [4096, 3462, 1, 1024] - - [967, 9669.63] + - [970, 9669.63] - - [64, 85, 752, 85] - - [1005, 5186.83] + - [1008, 5186.83] - - [1024, 3513, 1, 4096] - - [983, 9469.05] + - [986, 9469.05] - - [1024, 3397, 1, 4096] - - [986, 9151.67] + - [989, 9151.67] - - [4096, 3572, 1, 1024] - - [966, 9945.6] + - [969, 9945.6] - - [4096, 3389, 1, 1024] - - [978, 9740.76] + - [981, 9740.76] - - [4096, 3438, 1, 1024] - - [978, 9822.37] + - [981, 9822.37] - - [64, 102, 624, 100] - - [1013, 5486.9] + - [1016, 5486.9] - - [1024, 3640, 1, 33708] - - [974, 9083.43] + - [977, 9083.43] - - [1024, 3995, 1, 33708] - - [967, 9731.89] + - [970, 9731.89] - - [1024, 3165, 1, 4096] - - [979, 8601.8] + - [982, 8601.8] - - [4096, 3543, 1, 1024] - - [967, 9868.53] + - [970, 9868.53] - - [4096, 3352, 1, 1024] - - [962, 9668.34] + - [965, 9668.34] - - [1024, 3359, 1, 4096] - - [983, 9050.23] + - [986, 9050.23] - - [1024, 3470, 1, 4096] - - [986, 9355.07] + - [989, 9355.07] - - [64, 15, 4096, 15] - - [990, 1945.33] + - [993, 1945.33] - - [1024, 3392, 1, 4096] - - [985, 9139.61] + - [988, 9139.61] - - [64, 78, 816, 77] - - [997, 4870.46] + - [1000, 4870.46] - - [4096, 3137, 1, 1024] - - [962, 9600.12] + - [965, 9600.12] - - [4096, 3506, 1, 1024] - - [967, 9778.98] + - [970, 9778.98] - - [1024, 3095, 1, 4096] - - [979, 8381.14] + - [982, 8381.14] - - [1024, 3859, 1, 4096] - - [966, 9288.53] + - [969, 9288.53] - - [4096, 3369, 1, 1024] - - [978, 9697.63] + - [981, 9697.63] - - [64, 45, 1424, 45] - - [1015, 3883.64] + - [1018, 3883.64] - - [1024, 3435, 1, 4096] - - [986, 9264.52] + - [989, 9264.52] - - [1024, 3354, 1, 4096] - - [986, 9035.37] + - [989, 9035.37] - - [1024, 3055, 1, 4096] - - [967, 9597.35] + - [970, 9597.35] - - [4096, 3523, 1, 1024] - - [966, 9821.69] + - [969, 9821.69] - - [4096, 3380, 1, 1024] - - [962, 9721.29] + - [965, 9721.29] - - [1024, 3233, 1, 4096] - - [979, 8724.65] + - [982, 8724.65] - - [4096, 3221, 1, 1024] - - [966, 9660.94] + - [969, 9660.94] - - [4096, 3270, 1, 1024] - - [966, 9797.82] + - [969, 9797.82] - - [4096, 3593, 1, 1024] - - [977, 9679.21] + - [980, 9679.21] - - [1024, 3358, 1, 4096] - - [986, 9051.72] + - [989, 9051.72] - - [1024, 3540, 1, 4096] - - [986, 9533.49] + - [989, 9533.49] - - [4096, 3502, 1, 1024] - - [967, 9760.55] + - [970, 9760.55] - - [4096, 2505, 1, 1024] - - [967, 9680.42] + - [970, 9680.42] - - [4096, 3397, 1, 1024] - - [977, 9785.75] + - [980, 9785.75] - - [1024, 3300, 1, 4096] - - [980, 8907.75] + - [983, 8907.75] - - [4096, 3095, 1, 1024] - - [963, 9618.68] + - [966, 9618.68] - - [1024, 3182, 1, 4096] - - [979, 8606.06] + - [982, 8606.06] - - [1024, 3299, 1, 4096] - - [985, 8885.38] + - [988, 8885.38] - - [1024, 3276, 1, 4096] - - [980, 8872.65] + - [983, 8872.65] - - [1024, 3360, 1, 4096] - - [983, 9044.1] + - [986, 9044.1] - - [4096, 3360, 1, 1024] - - [978, 9681.29] + - [981, 9681.29] - - [4096, 2918, 1, 1024] - - [962, 9732.64] + - [965, 9732.64] - - [1024, 3939, 1, 33708] - - [966, 9595.86] + - [969, 9595.86] - - [4096, 3314, 1, 1024] - - [967, 9914.92] + - [970, 9914.92] - - [1024, 3319, 1, 4096] - - [986, 8956.27] + - [989, 8956.27] - - [64, 35, 1808, 35] - - [1003, 3060.17] + - [1006, 3060.17] - - [1024, 3942, 1, 1024] - - [979, 9211.73] + - [982, 9211.73] - - [1024, 3465, 1, 4096] - - [986, 9340.63] + - [989, 9340.63] - - [4096, 3546, 1, 1024] - - [967, 9875.31] + - [970, 9875.31] - - [1024, 3403, 1, 4096] - - [979, 9224.24] + - [982, 9224.24] - - [1024, 3948, 1, 1024] - - [965, 9245.53] + - [968, 9245.53] - - [4096, 3441, 1, 1024] - - [978, 9758.62] + - [981, 9758.62] - - [1024, 3139, 1, 4096] - - [979, 8582.74] + - [982, 8582.74] - - [1024, 3563, 1, 4096] - - [986, 9620.64] + - [989, 9620.64] - - [1024, 3508, 1, 4096] - - [983, 9449.26] + - [986, 9449.26] - - [1024, 3975, 1, 33708] - - [966, 9683.45] + - [969, 9683.45] - - [1024, 3446, 1, 4096] - - [985, 9289.41] + - [988, 9289.41] - - [1024, 3529, 1, 4096] - - [982, 9491.19] + - [985, 9491.19] - - [64, 112, 576, 112] - - [1007, 6387.04] + - [1010, 6387.04] - - [4096, 3461, 1, 1024] - - [967, 9663.23] + - [970, 9663.23] - - [1024, 3574, 1, 4096] - - [985, 9662.78] + - [988, 9662.78] - - [1024, 3101, 1, 4096] - - [980, 8468.24] + - [983, 8468.24] - - [1024, 3927, 1, 1024] - - [965, 9207.87] + - [968, 9207.87] - - [4096, 3224, 1, 1024] - - [967, 9665.51] + - [970, 9665.51] - - [4096, 3437, 1, 1024] - - [963, 9857.11] + - [966, 9857.11] - - [4096, 3900, 1, 1024] - - [978, 9826.15] + - [981, 9826.15] - - [1024, 3495, 1, 4096] - - [986, 9412.31] + - [989, 9412.31] - - [1024, 3977, 1, 33708] - - [966, 9687.77] + - [969, 9687.77] - - [1024, 3328, 1, 4096] - - [986, 8975.47] + - [989, 8975.47] - - [4096, 3168, 1, 1024] - - [962, 9754.77] + - [965, 9754.77] - - [1024, 4026, 1, 33708] - - [966, 9807.14] + - [969, 9807.14] - - [1024, 3292, 1, 4096] - - [979, 8901.73] + - [982, 8901.73] - - [1024, 3294, 1, 4096] - - [986, 8876.93] + - [989, 8876.93] - - [4096, 3335, 1, 1024] - - [963, 9616.13] + - [966, 9616.13] - - [4096, 3400, 1, 1024] - - [977, 9710.63] + - [980, 9710.63] - - [1024, 3287, 1, 4096] - - [964, 8907.97] + - [967, 8907.97] - - [1024, 3910, 1, 4096] - - [968, 9400.93] + - [971, 9400.93] - - [1024, 3780, 1, 1024] - - [979, 8863.19] + - [982, 8863.19] - - [4096, 3098, 1, 1024] - - [963, 9606.37] + - [966, 9606.37] - - [1024, 3584, 1, 33708] - - [986, 9775.23] + - [989, 9775.23] - - [64, 29, 2176, 29] - - [1008, 3134.93] + - [1011, 3134.93] - - [1024, 3371, 1, 4096] - - [964, 9117.71] + - [967, 9117.71] - - [1024, 3546, 1, 4096] - - [986, 9547.2] + - [989, 9547.2] - - [1024, 4012, 1, 1024] - - [968, 9353.63] + - [971, 9353.63] - - [4096, 3505, 1, 1024] - - [966, 9773.07] + - [969, 9773.07] - - [4096, 3554, 1, 1024] - - [966, 9895.49] + - [969, 9895.49] - - [4096, 3063, 1, 1024] - - [966, 9898.88] + - [969, 9898.88] - - [1024, 3900, 1, 33708] - - [967, 9502.83] + - [970, 9502.83] - - [1024, 3345, 1, 4096] - - [986, 9015.75] + - [989, 9015.75] - - [1024, 3357, 1, 4096] - - [986, 9041.13] + - [989, 9041.13] - - [1024, 3282, 1, 4096] - - [979, 8860.07] + - [982, 8860.07] - - [4096, 3484, 1, 1024] - - [967, 9721.23] + - [970, 9721.23] - - [1024, 3557, 1, 4096] - - [983, 9573.38] + - [986, 9573.38] - - [1024, 3476, 1, 4096] - - [986, 9361.62] + - [989, 9361.62] - - [1024, 3751, 1, 1024] - - [980, 8849.01] + - [983, 8849.01] - - [4096, 3379, 1, 1024] - - [963, 9741.39] + - [966, 9741.39] - - [4096, 3428, 1, 1024] - - [962, 9767.72] + - [965, 9767.72] - - [4096, 3126, 1, 1024] - - [977, 9701.8] + - [980, 9701.8] - - [64, 41, 1552, 41] - - [1012, 3555.59] + - [1015, 3555.59] - - [1024, 3325, 1, 4096] - - [964, 8962.31] + - [967, 8962.31] - - [4096, 3501, 1, 1024] - - [966, 9761.91] + - [969, 9761.91] - - [4096, 3358, 1, 1024] - - [962, 9680.32] + - [965, 9680.32] - - [1024, 3441, 1, 4096] - - [986, 9271.17] + - [989, 9271.17] - - [1024, 3552, 1, 4096] - - [982, 9565.32] + - [985, 9565.32] - - [4096, 3232, 1, 1024] - - [967, 9696.71] + - [970, 9696.71] - - [64, 18, 3440, 18] - - [987, 2059.23] + - [990, 2059.23] - - [1024, 3412, 1, 4096] - - [986, 9199.18] + - [989, 9199.18] - - [1024, 3372, 1, 4096] - - [983, 9083.39] + - [986, 9083.39] - - [1024, 3585, 1, 4096] - - [973, 8710.19] + - [976, 8710.19] - - [4096, 3143, 1, 1024] - - [978, 9692.02] + - [981, 9692.02] - - [4096, 3464, 1, 1024] - - [966, 9661.83] + - [969, 9661.83] - - [1024, 3145, 1, 4096] - - [965, 8526.23] + - [968, 8526.23] - - [4096, 3375, 1, 1024] - - [977, 9734.68] + - [980, 9734.68] - - [4096, 2917, 1, 1024] - - [962, 9714.47] + - [965, 9714.47] - - [4096, 3978, 1, 1024] - - [967, 9741.33] + - [970, 9741.33] - - [1024, 2765, 1, 4096] - - [968, 8706.65] + - [971, 8706.65] - - [64, 148, 432, 148] - - [993, 6372.07] + - [996, 6372.07] - - [1024, 3452, 1, 4096] - - [985, 9301.28] + - [988, 9301.28] - - [4096, 3584, 1, 1024] - - [967, 10005.6] + - [970, 10005.6] - - [4096, 3545, 1, 1024] - - [967, 9877.77] + - [970, 9877.77] - - [1024, 3352, 1, 4096] - - [986, 9035.09] + - [989, 9035.09] - - [64, 159, 400, 160] - - [995, 6952.01] + - [998, 6952.01] - - [4096, 3292, 1, 1024] - - [966, 9856.41] + - [969, 9856.41] - - [1024, 3525, 1, 4096] - - [986, 9501.4] + - [989, 9501.4] - - [1024, 3266, 1, 4096] - - [986, 8817.33] + - [989, 8817.33] - - [1024, 3382, 1, 4096] - - [985, 9101.44] + - [988, 9101.44] - - [4096, 3492, 1, 1024] - - [966, 9747.19] + - [969, 9747.19] - - [4096, 3419, 1, 1024] - - [978, 9745.78] + - [981, 9745.78] - - [1024, 3796, 1, 33708] - - [975, 9356.16] + - [978, 9356.16] - - [1024, 3293, 1, 4096] - - [982, 8868.3] + - [985, 8868.3] - - [4096, 3796, 1, 1024] - - [967, 9885.26] + - [970, 9885.26] - - [1024, 3487, 1, 4096] - - [983, 9391.24] + - [986, 9391.24] - - [4096, 3166, 1, 1024] - - [978, 9718.36] + - [981, 9718.36] - - [64, 102, 624, 101] - - [1007, 5547.74] + - [1010, 5547.74] - - [1024, 3409, 1, 4096] - - [986, 9187.78] + - [989, 9187.78] - - [1024, 3520, 1, 4096] - - [985, 9484.99] + - [988, 9484.99] - - [1024, 3573, 1, 4096] - - [986, 9652.61] + - [989, 9652.61] - - [4096, 3366, 1, 1024] - - [962, 9684.21] + - [965, 9684.21] - - [4096, 3720, 1, 1024] - - [978, 9703.24] + - [981, 9703.24] - - [4096, 3207, 1, 1024] - - [966, 9626.11] + - [969, 9626.11] - - [4096, 3272, 1, 1024] - - [966, 9795.41] + - [969, 9795.41] - - [1024, 3390, 1, 4096] - - [986, 9125.78] + - [989, 9125.78] - - [4096, 3183, 1, 1024] - - [978, 9825.77] + - [981, 9825.77] - - [4096, 3536, 1, 1024] - - [967, 9846.41] + - [970, 9846.41] - - [4096, 3563, 1, 1024] - - [967, 9913.7] + - [970, 9913.7] - - [1024, 3482, 1, 4096] - - [986, 9376.81] + - [989, 9376.81] - - [4096, 3447, 1, 1024] - - [977, 9874.99] + - [980, 9874.99] - - [4096, 3955, 1, 1024] - - [962, 9922.29] + - [965, 9922.29] - - [4096, 4005, 1, 1024] - - [967, 9803.33] + - [970, 9803.33] - - [1024, 3493, 1, 4096] - - [986, 9411.27] + - [989, 9411.27] - - [4096, 3410, 1, 1024] - - [962, 9788.24] + - [965, 9788.24] - - [1024, 3422, 1, 4096] - - [985, 9216.18] + - [988, 9216.18] - - [1024, 3350, 1, 4096] - - [980, 9067.92] + - [983, 9067.92] - - [4096, 3300, 1, 1024] - - [967, 9883.19] + - [970, 9883.19] - - [4096, 3910, 1, 1024] - - [977, 9800.02] + - [980, 9800.02] - - [1024, 3489, 1, 4096] - - [986, 9398.56] + - [989, 9398.56] - - [4096, 3483, 1, 1024] - - [966, 9715.86] + - [969, 9715.86] - - [4096, 3532, 1, 1024] - - [967, 9837.89] + - [970, 9837.89] - - [64, 101, 624, 101] - - [1007, 5452.18] + - [1010, 5452.18] - - [4096, 3230, 1, 1024] - - [967, 9683.5] + - [970, 9683.5] - - [4096, 3427, 1, 1024] - - [962, 9760.62] + - [965, 9760.62] - - [1024, 3377, 1, 4096] - - [986, 9101.07] + - [989, 9101.07] - - [1024, 3488, 1, 4096] - - [985, 9381.89] + - [988, 9381.89] - - [1024, 3616, 1, 4096] - - [968, 8709.23] + - [971, 8709.23] - - [1024, 3426, 1, 4096] - - [986, 9229.33] + - [989, 9229.33] - - [4096, 3357, 1, 1024] - - [978, 9668.4] + - [981, 9668.4] - - [4096, 3406, 1, 1024] - - [963, 9748.47] + - [966, 9748.47] - - [1024, 3046, 1, 4096] - - [968, 9590.33] + - [971, 9590.33] - - [1024, 3272, 1, 4096] - - [979, 8930.1] + - [982, 8930.1] - - [1024, 3256, 1, 4096] - - [964, 8828.06] + - [967, 8828.06] - - [4096, 3247, 1, 1024] - - [966, 9741.71] + - [969, 9741.71] - - [4096, 3088, 1, 1024] - - [978, 9588.97] + - [981, 9588.97] - - [1024, 3531, 1, 4096] - - [985, 9500.96] + - [988, 9500.96] - - [64, 160, 400, 160] - - [1021, 7333.93] + - [1024, 7333.93] - - [4096, 3511, 1, 1024] - - [967, 9789.28] + - [970, 9789.28] - - [1024, 3720, 1, 33708] - - [976, 9214.58] + - [979, 9214.58] - - [1024, 3267, 1, 4096] - - [979, 8830.94] + - [982, 8830.94] - - [1024, 3270, 1, 4096] - - [980, 8876.58] + - [983, 8876.58] - - [1024, 3461, 1, 4096] - - [985, 9327.45] + - [988, 9327.45] - - [4096, 3474, 1, 1024] - - [966, 9696.94] + - [969, 9696.94] - - [4096, 2984, 1, 1024] - - [967, 9673.98] + - [970, 9673.98] - - [1024, 3399, 1, 4096] - - [985, 9158.48] + - [988, 9158.48] - - [4096, 3574, 1, 1024] - - [966, 9942.2] + - [969, 9942.2] - - [1024, 3876, 1, 1024] - - [980, 9085.03] + - [983, 9085.03] - - [4096, 3337, 1, 1024] - - [963, 9611.33] + - [966, 9611.33] - - [4096, 3450, 1, 1024] - - [978, 9930.25] + - [981, 9930.25] - - [1024, 3720, 1, 1024] - - [964, 8755.39] + - [967, 8755.39] - - [1024, 4059, 1, 1024] - - [969, 9366.57] + - [972, 9366.57] - - [4096, 3291, 1, 1024] - - [966, 9856.23] + - [969, 9856.23] - - [64, 93, 688, 93] - - [1010, 5497.01] + - [1013, 5497.01] - - [4096, 3995, 1, 1024] - - [966, 9776.57] + - [969, 9776.57] - - [64, 147, 432, 147] - - [996, 6233.78] + - [999, 6233.78] - - [4096, 3491, 1, 1024] - - [966, 9742.84] + - [969, 9742.84] - - [4096, 3348, 1, 1024] - - [978, 9634.01] + - [981, 9634.01] - - [4096, 3925, 1, 1024] - - [977, 9848.44] + - [980, 9848.44] - - [4096, 3894, 1, 1024] - - [977, 9812.45] + - [980, 9812.45] - - [1024, 3456, 1, 4096] - - [986, 9317.81] + - [989, 9317.81] - - [1024, 3394, 1, 4096] - - [985, 9148.76] + - [988, 9148.76] - - [64, 100, 624, 102] - - [1007, 5416.85] + - [1010, 5416.85] - - [4096, 3165, 1, 1024] - - [977, 9743.25] + - [980, 9743.25] - - [4096, 3470, 1, 1024] - - [967, 9690.94] + - [970, 9690.94] - - [1024, 3014, 1, 4096] - - [968, 9486.16] + - [971, 9486.16] - - [1024, 3375, 1, 4096] - - [986, 9082.61] + - [989, 9082.61] - - [4096, 3859, 1, 1024] - - [977, 9738.77] + - [980, 9738.77] - - [4096, 3365, 1, 1024] - - [978, 9694.64] + - [981, 9694.64] - - [1024, 3162, 1, 4096] - - [979, 8550.21] + - [982, 8550.21] - - [1024, 3840, 1, 33708] - - [976, 9408.98] + - [979, 9408.98] - - [1024, 3437, 1, 4096] - - [986, 9270.39] + - [989, 9270.39] - - [4096, 3319, 1, 1024] - - [967, 9927.05] + - [970, 9927.05] - - [1024, 3320, 1, 4096] - - [986, 8962.19] + - [989, 8962.19] - - [64, 23, 2720, 23] - - [1009, 2569.43] + - [1012, 2569.43] - - [4096, 3328, 1, 1024] - - [966, 9997.31] + - [969, 9997.31] - - [1024, 3235, 1, 4096] - - [986, 8724.21] + - [989, 8724.21] - - [4096, 3282, 1, 1024] - - [967, 9827.03] + - [970, 9827.03] - - [1024, 3367, 1, 4096] - - [979, 9083.92] + - [982, 9083.92] - - [1024, 3542, 1, 4096] - - [986, 9533.0] + - [989, 9533.0] - - [64, 177, 352, 177] - - [972, 6817.81] + - [975, 6817.81] - - [4096, 3145, 1, 1024] - - [963, 9710.18] + - [966, 9710.18] - - [4096, 3514, 1, 1024] - - [966, 9792.96] + - [969, 9792.96] - - [1024, 3432, 1, 4096] - - [986, 9249.29] + - [989, 9249.29] - - [4096, 3409, 1, 1024] - - [962, 9721.5] + - [965, 9721.5] - - [1024, 4012, 1, 33708] - - [966, 9773.25] + - [969, 9773.25] - - [4096, 3876, 1, 1024] - - [963, 9745.55] + - [966, 9745.55] - - [4096, 3299, 1, 1024] - - [966, 9873.43] + - [969, 9873.43] - - [1024, 3168, 1, 4096] - - [979, 8597.03] + - [982, 8597.03] - - [4096, 3681, 1, 1024] - - [978, 9839.93] + - [981, 9839.93] - - [4096, 3531, 1, 1024] - - [967, 9847.66] + - [970, 9847.66] - - [4096, 3388, 1, 1024] - - [978, 9772.18] + - [981, 9772.18] - - [1024, 3720, 1, 4096] - - [967, 8951.5] + - [970, 8951.5] - - [1024, 3332, 1, 4096] - - [986, 8978.87] + - [989, 8978.87] - - [1024, 3273, 1, 4096] - - [980, 8982.39] + - [983, 8982.39] - - [1024, 2935, 1, 4096] - - [969, 9224.79] + - [972, 9224.79] - - [1024, 3467, 1, 4096] - - [983, 9329.23] + - [986, 9329.23] - - [4096, 3542, 1, 1024] - - [966, 9858.41] + - [969, 9858.41] - - [1024, 3130, 1, 4096] - - [965, 8526.56] + - [968, 8526.56] - - [1024, 3405, 1, 4096] - - [986, 9163.34] + - [989, 9163.34] - - [1024, 3960, 1, 1024] - - [964, 9280.26] + - [967, 9280.26] - - [4096, 3405, 1, 1024] - - [977, 9710.1] + - [980, 9710.1] - - [512, 512, 1, 1024] - - [1163, 6670.86] + - [1166, 6670.86] - - [8, 500, 1, 512] - - [1059, 228.571] + - [1062, 228.571] - - [512, 512, 1, 2000] - - [1196, 7629.34] + - [1199, 7629.34] - - [32, 512, 1, 512] - - [1056, 903.945] + - [1059, 903.945] - - [100, 1024, 1, 2048] - - [1118, 3196.88] + - [1121, 3196.88] - - [8, 512, 1, 500] - - [1049, 237.037] + - [1052, 237.037] - - [8, 500, 1, 1024] - - [1113, 289.266] + - [1116, 289.266] - - [100, 2000, 1, 1024] - - [1152, 3368.42] + - [1155, 3368.42] - - [64, 1024, 1, 100] - - [1051, 941.609] + - [1054, 941.609] - - [64, 1024, 1, 500] - - [1178, 2659.74] + - [1181, 2659.74] - - [64, 1024, 1, 1024] - - [1116, 2452.81] + - [1119, 2452.81] - - [128, 2000, 1, 100] - - [1172, 2560.0] + - [1175, 2560.0] - - [2, 500, 1, 2048] - - [1113, 72.1127] + - [1116, 72.1127] - - [16, 512, 1, 10] - - [1027, 18.2857] + - [1030, 18.2857] - - [64, 2000, 1, 1024] - - [1183, 2800.68] + - [1186, 2800.68] - - [100, 1024, 1, 1024] - - [1111, 3034.07] + - [1114, 3034.07] - - [8, 512, 1, 10] - - [1089, 9.14286] + - [1092, 9.14286] - - [16, 500, 1, 2048] - - [1113, 565.746] + - [1116, 565.746] - - [10, 100, 1, 500] - - [1049, 58.4112] + - [1052, 58.4112] - - [16, 100, 1, 10] - - [1089, 3.57143] + - [1092, 3.57143] - - [500, 1024, 1, 512] - - [1179, 6514.51] + - [1182, 6514.51] - - [128, 1024, 1, 512] - - [1197, 4194.3] + - [1200, 4194.3] - - [512, 500, 1, 2000] - - [1155, 7347.88] + - [1158, 7347.88] - - [2, 100, 1, 2000] - - [1049, 20.8333] + - [1052, 20.8333] - - [500, 512, 1, 100] - - [1171, 2539.68] + - [1174, 2539.68] - - [100, 1024, 1, 500] - - [1197, 3216.08] + - [1200, 3216.08] - - [256, 100, 1, 2048] - - [1207, 1689.07] + - [1210, 1689.07] - - [2, 512, 1, 512] - - [1063, 50.4123] + - [1066, 50.4123] - - [128, 2000, 1, 512] - - [1183, 4641.36] + - [1186, 4641.36] - - [2, 100, 1, 10] - - [1027, 0.396825] + - [1030, 0.396825] - - [16, 2000, 1, 2048] - - [1071, 1266.15] + - [1074, 1266.15] - - [200, 100, 1, 100] - - [1217, 316.456] + - [1220, 316.456] - - [256, 1024, 1, 100] - - [1173, 2685.9] + - [1176, 2685.9] - - [200, 500, 1, 1024] - - [1222, 3282.05] + - [1225, 3282.05] - - [500, 100, 1, 100] - - [1136, 631.313] + - [1139, 631.313] - - [4, 100, 1, 10] - - [1034, 0.877193] + - [1037, 0.877193] - - [32, 100, 1, 512] - - [1113, 198.835] + - [1116, 198.835] - - [100, 2000, 1, 512] - - [1183, 3832.34] + - [1186, 3832.34] - - [16, 1024, 1, 512] - - [1097, 794.376] + - [1100, 794.376] - - [200, 512, 1, 100] - - [1215, 1306.12] + - [1218, 1306.12] - - [4, 1024, 1, 1024] - - [1056, 213.125] + - [1059, 213.125] - - [512, 1024, 1, 512] - - [1180, 7049.25] + - [1183, 7049.25] - - [4, 512, 1, 10] - - [1088, 4.49123] + - [1091, 4.49123] - - [2, 2048, 1, 2000] - - [1049, 300.293] + - [1052, 300.293] - - [64, 2048, 1, 10] - - [1209, 240.941] + - [1212, 240.941] - - [128, 100, 1, 10] - - [1214, 27.5862] + - [1217, 27.5862] - - [4, 512, 1, 2048] - - [1049, 146.449] + - [1052, 146.449] - - [64, 2048, 1, 500] - - [1189, 4015.69] + - [1192, 4015.69] - - [512, 512, 1, 512] - - [1144, 6123.07] + - [1147, 6123.07] - - [500, 500, 1, 2000] - - [1155, 7126.57] + - [1158, 7126.57] - - [10, 1024, 1, 2000] - - [1122, 807.571] + - [1125, 807.571] - - [256, 100, 1, 100] - - [1134, 296.296] + - [1137, 296.296] - - [32, 2000, 1, 2048] - - [1077, 2167.2] + - [1080, 2167.2] - - [64, 1024, 1, 2048] - - [1110, 2383.13] + - [1113, 2383.13] - - [200, 2048, 1, 512] - - [1185, 5263.94] + - [1188, 5263.94] - - [256, 500, 1, 10] - - [1167, 210.526] + - [1170, 210.526] - - [16, 1024, 1, 100] - - [1047, 262.564] + - [1050, 262.564] - - [32, 1024, 1, 1024] - - [1052, 1476.87] + - [1055, 1476.87] - - [512, 500, 1, 512] - - [1141, 5851.43] + - [1144, 5851.43] - - [128, 1024, 1, 2000] - - [1225, 5516.5] + - [1228, 5516.5] - - [8, 100, 1, 500] - - [1049, 46.2963] + - [1052, 46.2963] - - [100, 2000, 1, 2048] - - [1204, 3715.53] + - [1207, 3715.53] - - [10, 512, 1, 512] - - [1059, 292.571] + - [1062, 292.571] - - [8, 500, 1, 10] - - [1088, 8.77193] + - [1091, 8.77193] - - [10, 2000, 1, 1024] - - [1102, 640.0] + - [1105, 640.0] - - [16, 1024, 1, 10] - - [1087, 36.5714] + - [1090, 36.5714] - - [16, 512, 1, 2048] - - [1066, 585.797] + - [1069, 585.797] - - [256, 512, 1, 10] - - [1132, 230.761] + - [1135, 230.761] - - [2, 2000, 1, 100] - - [1094, 64.1026] + - [1097, 64.1026] - - [128, 512, 1, 2048] - - [1061, 3106.89] + - [1064, 3106.89] - - [128, 512, 1, 100] - - [1054, 952.558] + - [1057, 952.558] - - [512, 2000, 1, 1024] - - [1151, 8065.97] + - [1154, 8065.97] - - [64, 500, 1, 2048] - - [1220, 1857.6] + - [1223, 1857.6] - - [64, 2000, 1, 2048] - - [1202, 3442.02] + - [1205, 3442.02] - - [64, 2048, 1, 512] - - [1203, 3315.66] + - [1206, 3315.66] - - [10, 2000, 1, 512] - - [1049, 785.276] + - [1052, 785.276] - - [32, 2000, 1, 500] - - [1052, 2500.0] + - [1055, 2500.0] - - [64, 2000, 1, 10] - - [1040, 231.884] + - [1043, 231.884] - - [500, 100, 1, 10] - - [1137, 88.0282] + - [1140, 88.0282] - - [128, 1024, 1, 500] - - [1188, 4096.0] + - [1191, 4096.0] - - [64, 100, 1, 2048] - - [1049, 587.24] + - [1052, 587.24] - - [64, 100, 1, 10] - - [1208, 11.9403] + - [1211, 11.9403] - - [16, 512, 1, 500] - - [1059, 461.261] + - [1062, 461.261] - - [32, 2000, 1, 1024] - - [1046, 1713.81] + - [1049, 1713.81] - - [200, 512, 1, 1024] - - [1225, 3244.36] + - [1228, 3244.36] - - [128, 2048, 1, 10] - - [1041, 455.111] + - [1044, 455.111] - - [200, 100, 1, 2000] - - [1049, 1461.99] + - [1052, 1461.99] - - [2, 100, 1, 512] - - [1049, 12.4272] + - [1052, 12.4272] - - [64, 2048, 1, 100] - - [1215, 1689.07] + - [1218, 1689.07] - - [32, 512, 1, 100] - - [1048, 265.974] + - [1051, 265.974] - - [16, 512, 1, 1024] - - [1113, 569.878] + - [1116, 569.878] - - [4, 1024, 1, 512] - - [1103, 208.051] + - [1106, 208.051] - - [64, 2000, 1, 100] - - [1215, 1649.48] + - [1218, 1649.48] - - [512, 2048, 1, 512] - - [1151, 7848.99] + - [1154, 7848.99] - - [2, 500, 1, 500] - - [1037, 53.4188] + - [1040, 53.4188] - - [32, 100, 1, 100] - - [1048, 57.1429] + - [1051, 57.1429] - - [100, 500, 1, 2000] - - [1052, 2783.96] + - [1055, 2783.96] - - [200, 2000, 1, 100] - - [1124, 2994.01] + - [1127, 2994.01] - - [10, 512, 1, 10] - - [1084, 11.0345] + - [1087, 11.0345] - - [100, 500, 1, 2048] - - [1224, 2361.62] + - [1227, 2361.62] - - [4, 2048, 1, 500] - - [1059, 379.259] + - [1062, 379.259] - - [200, 500, 1, 100] - - [1185, 1288.66] + - [1188, 1288.66] - - [500, 500, 1, 500] - - [1141, 5425.35] + - [1144, 5425.35] - - [2, 100, 1, 1024] - - [1113, 16.2025] + - [1116, 16.2025] - - [128, 2048, 1, 512] - - [1199, 4699.5] + - [1202, 4699.5] - - [200, 2000, 1, 1024] - - [1149, 4620.94] + - [1152, 4620.94] - - [32, 512, 1, 1024] - - [1112, 1028.02] + - [1115, 1028.02] - - [100, 2048, 1, 500] - - [1173, 4142.39] + - [1176, 4142.39] - - [256, 100, 1, 1024] - - [1203, 1443.52] + - [1206, 1443.52] - - [16, 2000, 1, 500] - - [1098, 1428.57] + - [1101, 1428.57] - - [128, 100, 1, 100] - - [1048, 213.333] + - [1051, 213.333] - - [500, 500, 1, 2048] - - [1145, 6639.0] + - [1148, 6639.0] - - [32, 512, 1, 10] - - [1081, 35.9298] + - [1084, 35.9298] - - [128, 100, 1, 1024] - - [1109, 791.498] + - [1112, 791.498] - - [16, 500, 1, 2000] - - [1122, 694.444] + - [1125, 694.444] - - [4, 2048, 1, 100] - - [1093, 129.62] + - [1096, 129.62] - - [64, 500, 1, 500] - - [1035, 1333.33] + - [1038, 1333.33] - - [500, 1024, 1, 2048] - - [1154, 7031.76] + - [1157, 7031.76] - - [512, 2048, 1, 100] - - [1129, 5285.16] + - [1132, 5285.16] - - [128, 512, 1, 1024] - - [1221, 2519.1] + - [1224, 2519.1] - - [128, 512, 1, 2000] - - [1219, 3608.81] + - [1222, 3608.81] - - [128, 2000, 1, 2000] - - [1192, 7017.54] + - [1195, 7017.54] - - [2, 512, 1, 10] - - [1085, 2.03175] + - [1088, 2.03175] - - [10, 512, 1, 500] - - [1049, 293.578] + - [1052, 293.578] - - [4, 1024, 1, 2000] - - [1069, 326.115] + - [1072, 326.115] - - [256, 100, 1, 2000] - - [1206, 1767.96] + - [1209, 1767.96] - - [512, 2048, 1, 2000] - - [1151, 8674.52] + - [1154, 8674.52] - - [100, 100, 1, 10] - - [1213, 21.5517] + - [1216, 21.5517] - - [256, 500, 1, 1024] - - [1153, 4833.04] + - [1156, 4833.04] - - [128, 512, 1, 10] - - [1041, 132.129] + - [1044, 132.129] - - [256, 100, 1, 500] - - [1200, 914.286] + - [1203, 914.286] - - [64, 100, 1, 512] - - [1107, 369.009] + - [1110, 369.009] - - [64, 512, 1, 500] - - [1049, 1600.0] + - [1052, 1600.0] - - [64, 2048, 1, 2000] - - [1203, 5925.5] + - [1206, 5925.5] - - [100, 2048, 1, 1024] - - [1161, 3260.5] + - [1164, 3260.5] - - [200, 2000, 1, 10] - - [1041, 595.238] + - [1044, 595.238] - - [128, 1024, 1, 100] - - [1185, 1689.07] + - [1188, 1689.07] - - [16, 2000, 1, 100] - - [1048, 493.827] + - [1051, 493.827] - - [8, 100, 1, 512] - - [1049, 49.7087] + - [1052, 49.7087] - - [500, 2048, 1, 1024] - - [1151, 7651.61] + - [1154, 7651.61] - - [500, 2000, 1, 10] - - [1139, 1008.06] + - [1142, 1008.06] - - [32, 100, 1, 500] - - [1113, 186.916] + - [1116, 186.916] - - [256, 1024, 1, 2048] - - [1154, 6190.85] + - [1157, 6190.85] - - [32, 500, 1, 2048] - - [1049, 1083.6] + - [1052, 1083.6] - - [4, 2000, 1, 10] - - [1092, 17.5439] + - [1095, 17.5439] - - [128, 500, 1, 2000] - - [1109, 3516.48] + - [1112, 3516.48] - - [8, 1024, 1, 10] - - [1083, 17.9649] + - [1086, 17.9649] - - [2, 500, 1, 100] - - [1028, 16.0256] + - [1031, 16.0256] - - [10, 500, 1, 512] - - [1049, 290.909] + - [1052, 290.909] - - [10, 2000, 1, 10] - - [1027, 38.4615] + - [1030, 38.4615] - - [500, 512, 1, 512] - - [1144, 5893.53] + - [1147, 5893.53] - - [32, 500, 1, 500] - - [1049, 892.857] + - [1052, 892.857] - - [256, 500, 1, 2000] - - [1158, 6237.82] + - [1161, 6237.82] - - [100, 500, 1, 100] - - [1060, 726.744] + - [1063, 726.744] - - [500, 2048, 1, 100] - - [1133, 4866.92] + - [1136, 4866.92] - - [10, 1024, 1, 512] - - [1049, 520.127] + - [1052, 520.127] - - [2, 2048, 1, 512] - - [1059, 151.528] + - [1062, 151.528] - - [256, 512, 1, 100] - - [1138, 1590.68] + - [1141, 1590.68] - - [10, 2048, 1, 100] - - [1049, 324.051] + - [1052, 324.051] - - [8, 2048, 1, 100] - - [1104, 256.0] + - [1107, 256.0] - - [512, 100, 1, 512] - - [1200, 2100.51] + - [1203, 2100.51] - - [4, 500, 1, 500] - - [1049, 115.741] + - [1052, 115.741] - - [64, 100, 1, 1024] - - [1049, 450.11] + - [1052, 450.11] - - [2, 2048, 1, 1024] - - [1106, 137.608] + - [1109, 137.608] - - [2, 500, 1, 2000] - - [1075, 90.2527] + - [1078, 90.2527] - - [512, 1024, 1, 500] - - [1180, 6898.53] + - [1183, 6898.53] - - [128, 2000, 1, 500] - - [1185, 5161.29] + - [1188, 5161.29] - - [32, 512, 1, 2048] - - [1119, 1103.76] + - [1122, 1103.76] - - [10, 100, 1, 2000] - - [1049, 105.932] + - [1052, 105.932] - - [4, 100, 1, 512] - - [1049, 24.6154] + - [1052, 24.6154] - - [2, 512, 1, 2048] - - [1113, 73.2246] + - [1116, 73.2246] - - [200, 512, 1, 2048] - - [1225, 3953.91] + - [1228, 3953.91] - - [200, 2000, 1, 2000] - - [1187, 6230.53] + - [1190, 6230.53] - - [100, 100, 1, 2000] - - [1049, 827.815] + - [1052, 827.815] - - [500, 2048, 1, 2000] - - [1150, 8387.94] + - [1153, 8387.94] - - [64, 2048, 1, 2048] - - [1195, 3406.54] + - [1198, 3406.54] - - [16, 2000, 1, 1024] - - [1055, 1024.0] + - [1058, 1024.0] - - [512, 2048, 1, 1024] - - [1128, 8061.12] + - [1131, 8061.12] - - [10, 500, 1, 500] - - [1059, 284.091] + - [1062, 284.091] - - [200, 1024, 1, 2048] - - [1223, 4886.19] + - [1226, 4886.19] - - [10, 2000, 1, 2000] - - [1049, 1449.28] + - [1052, 1449.28] - - [8, 2000, 1, 500] - - [1098, 719.424] + - [1101, 719.424] - - [2, 100, 1, 2048] - - [1113, 19.845] + - [1116, 19.845] - - [32, 100, 1, 2048] - - [1113, 323.794] + - [1116, 323.794] - - [512, 512, 1, 10] - - [1170, 420.103] + - [1173, 420.103] - - [512, 500, 1, 10] - - [1175, 376.471] + - [1178, 376.471] - - [16, 100, 1, 1024] - - [1059, 129.62] + - [1062, 129.62] - - [2, 500, 1, 10] - - [1023, 2.11864] + - [1026, 2.11864] - - [200, 512, 1, 10] - - [1025, 188.235] + - [1028, 188.235] - - [512, 1024, 1, 100] - - [1125, 3877.87] + - [1128, 3877.87] - - [16, 2000, 1, 2000] - - [1049, 2222.22] + - [1052, 2222.22] - - [500, 500, 1, 1024] - - [1145, 6130.27] + - [1148, 6130.27] - - [500, 100, 1, 2048] - - [1200, 2949.31] + - [1203, 2949.31] - - [256, 1024, 1, 512] - - [1164, 5886.74] + - [1167, 5886.74] - - [256, 500, 1, 512] - - [1142, 4380.75] + - [1145, 4380.75] - - [16, 1024, 1, 2000] - - [1113, 1208.26] + - [1116, 1208.26] - - [200, 500, 1, 2048] - - [1225, 3855.42] + - [1228, 3855.42] - - [256, 2000, 1, 10] - - [1127, 727.273] + - [1130, 727.273] - - [10, 2048, 1, 2048] - - [1080, 823.058] + - [1083, 823.058] - - [512, 2000, 1, 100] - - [1129, 5120.0] + - [1132, 5120.0] - - [10, 1024, 1, 1024] - - [1056, 553.046] + - [1059, 553.046] - - [512, 2000, 1, 2048] - - [1157, 7563.3] + - [1160, 7563.3] - - [500, 1024, 1, 500] - - [1181, 6570.84] + - [1184, 6570.84] - - [500, 100, 1, 512] - - [1200, 2038.22] + - [1203, 2038.22] - - [256, 2000, 1, 100] - - [1149, 3764.71] + - [1152, 3764.71] - - [512, 1024, 1, 2048] - - [1193, 7286.52] + - [1196, 7286.52] - - [32, 512, 1, 500] - - [1049, 898.246] + - [1052, 898.246] - - [100, 2000, 1, 10] - - [1041, 333.333] + - [1044, 333.333] - - [100, 500, 1, 512] - - [1219, 2176.87] + - [1222, 2176.87] - - [8, 2000, 1, 512] - - [1098, 602.353] + - [1101, 602.353] - - [100, 2048, 1, 2048] - - [1205, 3694.77] + - [1208, 3694.77] - - [128, 1024, 1, 2048] - - [1224, 4168.25] + - [1227, 4168.25] - - [8, 500, 1, 2000] - - [1123, 352.113] + - [1126, 352.113] - - [100, 2000, 1, 500] - - [1173, 4045.31] + - [1176, 4045.31] - - [100, 2048, 1, 100] - - [1173, 2081.3] + - [1176, 2081.3] - - [4, 100, 1, 1024] - - [1049, 33.0323] + - [1052, 33.0323] - - [500, 2048, 1, 2048] - - [1157, 7764.93] + - [1160, 7764.93] - - [2, 2000, 1, 2048] - - [1068, 166.234] + - [1071, 166.234] - - [200, 2048, 1, 10] - - [1042, 609.524] + - [1045, 609.524] - - [2, 500, 1, 1024] - - [1113, 75.2941] + - [1116, 75.2941] - - [100, 500, 1, 1024] - - [1109, 1975.31] + - [1112, 1975.31] - - [16, 2048, 1, 500] - - [1049, 1473.38] + - [1052, 1473.38] - - [100, 1024, 1, 10] - - [1209, 185.507] + - [1212, 185.507] - - [8, 2048, 1, 1024] - - [1105, 543.304] + - [1108, 543.304] - - [2, 2000, 1, 500] - - [1049, 179.856] + - [1052, 179.856] - - [32, 100, 1, 1024] - - [1049, 267.712] + - [1052, 267.712] - - [500, 2000, 1, 512] - - [1179, 7087.49] + - [1182, 7087.49] - - [64, 100, 1, 2000] - - [1059, 615.385] + - [1062, 615.385] - - [100, 1024, 1, 2000] - - [1222, 4224.42] + - [1225, 4224.42] - - [64, 500, 1, 10] - - [1024, 63.4921] + - [1027, 63.4921] - - [32, 2048, 1, 100] - - [1045, 941.609] + - [1048, 941.609] - - [64, 500, 1, 512] - - [1049, 1575.38] + - [1052, 1575.38] - - [10, 100, 1, 1024] - - [1059, 82.5806] + - [1062, 82.5806] - - [16, 512, 1, 100] - - [1048, 148.406] + - [1051, 148.406] - - [4, 100, 1, 2000] - - [1122, 43.8597] + - [1125, 43.8597] - - [2, 512, 1, 1024] - - [1113, 74.052] + - [1116, 74.052] - - [64, 512, 1, 1024] - - [1114, 1570.9] + - [1117, 1570.9] - - [10, 2048, 1, 500] - - [1049, 920.863] + - [1052, 920.863] - - [4, 2000, 1, 2048] - - [1068, 326.115] + - [1071, 326.115] - - [512, 100, 1, 2048] - - [1203, 3084.05] + - [1206, 3084.05] - - [32, 100, 1, 2000] - - [1049, 343.348] + - [1052, 343.348] - - [256, 512, 1, 500] - - [1142, 4311.58] + - [1145, 4311.58] - - [100, 2000, 1, 100] - - [1173, 2016.13] + - [1176, 2016.13] - - [8, 2000, 1, 1024] - - [1062, 544.681] + - [1065, 544.681] - - [4, 512, 1, 500] - - [1049, 118.519] + - [1052, 118.519] - - [128, 1024, 1, 10] - - [1212, 244.537] + - [1215, 244.537] - - [4, 500, 1, 1024] - - [1049, 144.633] + - [1052, 144.633] - - [32, 2048, 1, 512] - - [1052, 2139.95] + - [1055, 2139.95] - - [32, 100, 1, 10] - - [1027, 7.01754] + - [1030, 7.01754] - - [100, 2048, 1, 10] - - [1216, 341.333] + - [1219, 341.333] - - [512, 500, 1, 100] - - [1177, 2461.54] + - [1180, 2461.54] - - [128, 2000, 1, 1024] - - [1161, 4174.27] + - [1164, 4174.27] - - [200, 1024, 1, 500] - - [1173, 4295.3] + - [1176, 4295.3] - - [32, 2048, 1, 1024] - - [1076, 1667.72] + - [1079, 1667.72] - - [10, 1024, 1, 2048] - - [1067, 555.39] + - [1070, 555.39] - - [8, 500, 1, 100] - - [1048, 71.4286] + - [1051, 71.4286] - - [32, 2048, 1, 500] - - [1052, 2528.4] + - [1055, 2528.4] - - [200, 100, 1, 1024] - - [1061, 1071.13] + - [1064, 1071.13] - - [16, 100, 1, 100] - - [1038, 28.5714] + - [1041, 28.5714] - - [8, 1024, 1, 2000] - - [1122, 654.313] + - [1125, 654.313] - - [4, 512, 1, 100] - - [1048, 36.5714] + - [1051, 36.5714] - - [16, 500, 1, 100] - - [1048, 142.857] + - [1051, 142.857] - - [8, 1024, 1, 2048] - - [1074, 441.506] + - [1077, 441.506] - - [16, 1024, 1, 2048] - - [1075, 886.745] + - [1078, 886.745] - - [10, 2048, 1, 1024] - - [1053, 639.376] + - [1056, 639.376] - - [64, 512, 1, 100] - - [1048, 518.481] + - [1051, 518.481] - - [2, 100, 1, 500] - - [1049, 9.61538] + - [1052, 9.61538] - - [2, 500, 1, 512] - - [1055, 48.1203] + - [1058, 48.1203] - - [256, 512, 1, 2000] - - [1158, 6450.39] + - [1161, 6450.39] - - [128, 500, 1, 1024] - - [1052, 2497.56] + - [1055, 2497.56] - - [10, 100, 1, 10] - - [1089, 2.23214] + - [1092, 2.23214] - - [8, 2048, 1, 2048] - - [1039, 643.298] + - [1042, 643.298] - - [16, 2048, 1, 2048] - - [1079, 1337.9] + - [1082, 1337.9] - - [64, 1024, 1, 10] - - [1042, 132.129] + - [1045, 132.129] - - [500, 100, 1, 500] - - [1200, 1940.99] + - [1203, 1940.99] - - [256, 1024, 1, 2000] - - [1196, 7629.34] + - [1199, 7629.34] - - [200, 512, 1, 500] - - [1185, 3232.32] + - [1188, 3232.32] - - [8, 2000, 1, 10] - - [1086, 32.2581] + - [1089, 32.2581] - - [64, 2000, 1, 512] - - [1184, 3225.2] + - [1187, 3225.2] - - [2, 512, 1, 100] - - [1028, 16.6234] + - [1031, 16.6234] - - [4, 2000, 1, 2000] - - [1049, 586.51] + - [1052, 586.51] - - [200, 1024, 1, 100] - - [1173, 2133.33] + - [1176, 2133.33] - - [16, 100, 1, 500] - - [1113, 92.5926] + - [1116, 92.5926] - - [128, 100, 1, 500] - - [1109, 526.316] + - [1112, 526.316] - - [500, 1024, 1, 1024] - - [1143, 7201.76] + - [1146, 7201.76] - - [200, 1024, 1, 1024] - - [1195, 4519.72] + - [1198, 4519.72] - - [8, 2048, 1, 512] - - [1059, 624.152] + - [1062, 624.152] - - [200, 2000, 1, 500] - - [1149, 5186.72] + - [1152, 5186.72] - - [512, 100, 1, 1024] - - [1200, 2742.09] + - [1203, 2742.09] - - [16, 100, 1, 2000] - - [1059, 168.776] + - [1062, 168.776] - - [500, 512, 1, 2000] - - [1196, 7289.29] + - [1199, 7289.29] - - [8, 2000, 1, 2048] - - [1070, 668.189] + - [1073, 668.189] - - [256, 2048, 1, 100] - - [1131, 3924.31] + - [1134, 3924.31] - - [32, 2048, 1, 2000] - - [1063, 3882.46] + - [1066, 3882.46] - - [200, 500, 1, 512] - - [1188, 3368.42] + - [1191, 3368.42] - - [10, 512, 1, 100] - - [1048, 91.4286] + - [1051, 91.4286] - - [16, 2000, 1, 10] - - [1026, 61.5385] + - [1029, 61.5385] - - [8, 512, 1, 100] - - [1048, 72.1127] + - [1051, 72.1127] - - [256, 512, 1, 512] - - [1153, 4583.94] + - [1156, 4583.94] - - [500, 2000, 1, 1024] - - [1128, 7569.49] + - [1131, 7569.49] - - [512, 512, 1, 500] - - [1144, 5708.71] + - [1147, 5708.71] - - [256, 2048, 1, 1024] - - [1168, 5923.11] + - [1171, 5923.11] - - [8, 2048, 1, 2000] - - [1049, 1153.8] + - [1052, 1153.8] - - [100, 512, 1, 2048] - - [1115, 2383.13] + - [1118, 2383.13] - - [100, 1024, 1, 512] - - [1200, 3343.67] + - [1203, 3343.67] - - [128, 100, 1, 2000] - - [1218, 1084.75] + - [1221, 1084.75] - - [4, 2048, 1, 2048] - - [1067, 332.354] + - [1070, 332.354] - - [2, 1024, 1, 2000] - - [1078, 161.006] + - [1081, 161.006] - - [100, 512, 1, 512] - - [1052, 2184.53] + - [1055, 2184.53] - - [128, 1024, 1, 1024] - - [1195, 3847.99] + - [1198, 3847.99] - - [200, 2048, 1, 1024] - - [1130, 4547.16] + - [1133, 4547.16] - - [32, 1024, 1, 2000] - - [1059, 2416.52] + - [1062, 2416.52] - - [128, 500, 1, 100] - - [1054, 919.54] + - [1057, 919.54] - - [200, 512, 1, 2000] - - [1222, 4238.41] + - [1225, 4238.41] - - [10, 2048, 1, 2000] - - [1059, 1454.55] + - [1062, 1454.55] - - [256, 1024, 1, 500] - - [1156, 5669.2] + - [1159, 5669.2] - - [100, 100, 1, 100] - - [1048, 171.233] + - [1051, 171.233] - - [8, 512, 1, 1024] - - [1117, 286.496] + - [1120, 286.496] - - [200, 1024, 1, 512] - - [1173, 4354.55] + - [1176, 4354.55] - - [256, 500, 1, 500] - - [1158, 4020.1] + - [1161, 4020.1] - - [200, 100, 1, 500] - - [1222, 702.247] + - [1225, 702.247] - - [2, 1024, 1, 2048] - - [1068, 112.75] + - [1071, 112.75] - - [256, 500, 1, 2048] - - [1158, 5041.23] + - [1161, 5041.23] - - [512, 2048, 1, 500] - - [1151, 7710.12] + - [1154, 7710.12] - - [512, 100, 1, 2000] - - [1200, 3099.27] + - [1203, 3099.27] - - [512, 500, 1, 1024] - - [1159, 6463.12] + - [1162, 6463.12] - - [16, 512, 1, 2000] - - [1075, 721.127] + - [1078, 721.127] - - [64, 500, 1, 1024] - - [1114, 1528.36] + - [1117, 1528.36] - - [512, 2000, 1, 10] - - [1135, 1174.31] + - [1138, 1174.31] - - [256, 512, 1, 1024] - - [1153, 4978.4] + - [1156, 4978.4] - - [10, 512, 1, 1024] - - [1113, 370.26] + - [1116, 370.26] - - [512, 100, 1, 100] - - [1136, 659.794] + - [1139, 659.794] - - [8, 2000, 1, 100] - - [1048, 256.41] + - [1051, 256.41] - - [128, 2048, 1, 1024] - - [1161, 4173.44] + - [1164, 4173.44] - - [2, 2000, 1, 2000] - - [1049, 250.627] + - [1052, 250.627] - - [16, 2048, 1, 1024] - - [1096, 1045.96] + - [1099, 1045.96] - - [500, 512, 1, 500] - - [1141, 5517.24] + - [1144, 5517.24] - - [8, 100, 1, 1024] - - [1114, 64.0] + - [1117, 64.0] - - [10, 100, 1, 100] - - [1038, 17.8571] + - [1041, 17.8571] - - [200, 500, 1, 500] - - [1188, 3140.7] + - [1191, 3140.7] - - [10, 500, 1, 2000] - - [1075, 444.84] + - [1078, 444.84] - - [500, 100, 1, 2000] - - [1203, 2969.12] + - [1206, 2969.12] - - [100, 512, 1, 2000] - - [1115, 2776.57] + - [1118, 2776.57] - - [500, 1024, 1, 2000] - - [1194, 8020.05] + - [1197, 8020.05] - - [32, 2000, 1, 2000] - - [1055, 3827.75] + - [1058, 3827.75] - - [64, 1024, 1, 512] - - [1219, 2573.19] + - [1222, 2573.19] - - [64, 2000, 1, 2000] - - [1188, 5797.1] + - [1191, 5797.1] - - [32, 500, 1, 100] - - [1048, 266.667] + - [1051, 266.667] - - [128, 2000, 1, 2048] - - [1204, 4547.95] + - [1207, 4547.95] - - [10, 100, 1, 2048] - - [1113, 98.4615] + - [1116, 98.4615] - - [32, 2048, 1, 2048] - - [1076, 2213.35] + - [1079, 2213.35] - - [64, 100, 1, 100] - - [1049, 96.3855] + - [1052, 96.3855] - - [2, 1024, 1, 100] - - [1099, 34.5946] + - [1102, 34.5946] - - [256, 1024, 1, 10] - - [1169, 425.558] + - [1172, 425.558] - - [256, 1024, 1, 1024] - - [1162, 5482.75] + - [1165, 5482.75] - - [64, 500, 1, 2000] - - [1049, 2056.56] + - [1052, 2056.56] - - [512, 2000, 1, 512] - - [1147, 7550.23] + - [1150, 7550.23] - - [8, 512, 1, 512] - - [1056, 231.986] + - [1059, 231.986] - - [8, 512, 1, 2048] - - [1049, 290.464] + - [1052, 290.464] - - [100, 100, 1, 1024] - - [1219, 624.39] + - [1222, 624.39] - - [2, 2048, 1, 10] - - [1092, 8.82759] + - [1095, 8.82759] - - [4, 2048, 1, 512] - - [1098, 312.076] + - [1101, 312.076] - - [4, 2048, 1, 10] - - [1091, 17.9649] + - [1094, 17.9649] - - [8, 100, 1, 2000] - - [1068, 85.8369] + - [1071, 85.8369] - - [2, 1024, 1, 1024] - - [1065, 101.214] + - [1068, 101.214] - - [16, 2048, 1, 100] - - [1049, 518.481] + - [1052, 518.481] - - [16, 512, 1, 512] - - [1059, 455.903] + - [1062, 455.903] - - [32, 500, 1, 512] - - [1056, 906.195] + - [1059, 906.195] - - [500, 2000, 1, 2000] - - [1151, 8143.32] + - [1154, 8143.32] - - [500, 1024, 1, 10] - - [1132, 680.851] + - [1135, 680.851] - - [32, 500, 1, 1024] - - [1108, 1008.87] + - [1111, 1008.87] - - [32, 500, 1, 10] - - [1044, 33.3333] + - [1047, 33.3333] - - [500, 500, 1, 10] - - [1173, 367.647] + - [1176, 367.647] - - [4, 2000, 1, 500] - - [1059, 370.37] + - [1062, 370.37] - - [10, 2000, 1, 500] - - [1049, 899.281] + - [1052, 899.281] - - [32, 2000, 1, 512] - - [1061, 2089.8] + - [1064, 2089.8] - - [256, 500, 1, 100] - - [1174, 1495.33] + - [1177, 1495.33] - - [256, 2048, 1, 10] - - [1132, 789.59] + - [1135, 789.59] - - [4, 1024, 1, 500] - - [1049, 222.609] + - [1052, 222.609] - - [256, 512, 1, 2048] - - [1158, 5292.5] + - [1161, 5292.5] - - [2, 2000, 1, 1024] - - [1096, 137.265] + - [1099, 137.265] - - [256, 100, 1, 512] - - [1200, 1085.03] + - [1203, 1085.03] - - [8, 1024, 1, 500] - - [1049, 441.379] + - [1052, 441.379] - - [256, 2048, 1, 500] - - [1179, 7031.76] + - [1182, 7031.76] - - [256, 2048, 1, 2048] - - [1142, 6771.83] + - [1145, 6771.83] - - [2, 2000, 1, 512] - - [1103, 159.006] + - [1106, 159.006] - - [256, 2000, 1, 512] - - [1146, 6527.49] + - [1149, 6527.49] - - [4, 1024, 1, 100] - - [1095, 70.137] + - [1098, 70.137] - - [512, 1024, 1, 2000] - - [1180, 8295.7] + - [1183, 8295.7] - - [100, 500, 1, 500] - - [1052, 2016.13] + - [1055, 2016.13] - - [4, 2048, 1, 1024] - - [1100, 284.939] + - [1103, 284.939] - - [2, 1024, 1, 500] - - [1049, 109.402] + - [1052, 109.402] - - [64, 100, 1, 500] - - [1049, 296.296] + - [1052, 296.296] - - [256, 2000, 1, 2000] - - [1157, 8152.87] + - [1160, 8152.87] - - [2, 512, 1, 500] - - [1055, 44.7552] + - [1058, 44.7552] - - [8, 2048, 1, 500] - - [1049, 736.691] + - [1052, 736.691] - - [10, 1024, 1, 500] - - [1049, 547.009] + - [1052, 547.009] - - [4, 2048, 1, 2000] - - [1059, 604.13] + - [1062, 604.13] - - [200, 1024, 1, 2000] - - [1226, 5400.84] + - [1229, 5400.84] - - [128, 500, 1, 512] - - [1219, 2730.67] + - [1222, 2730.67] - - [10, 500, 1, 2048] - - [1113, 359.551] + - [1116, 359.551] - - [256, 2048, 1, 2000] - - [1157, 8375.21] + - [1160, 8375.21] - - [8, 2000, 1, 2000] - - [1059, 1146.13] + - [1062, 1146.13] - - [100, 2048, 1, 512] - - [1182, 3936.1] + - [1185, 3936.1] - - [512, 500, 1, 2048] - - [1158, 6756.29] + - [1161, 6756.29] - - [200, 2048, 1, 100] - - [1149, 3180.12] + - [1152, 3180.12] - - [128, 512, 1, 512] - - [1052, 2872.81] + - [1055, 2872.81] - - [200, 2000, 1, 2048] - - [1198, 4818.82] + - [1201, 4818.82] - - [4, 2000, 1, 1024] - - [1096, 275.269] + - [1099, 275.269] - - [64, 512, 1, 10] - - [1211, 69.4237] + - [1214, 69.4237] - - [32, 500, 1, 2000] - - [1078, 1246.11] + - [1081, 1246.11] - - [128, 2048, 1, 2000] - - [1191, 7233.55] + - [1194, 7233.55] - - [100, 100, 1, 2048] - - [1049, 790.123] + - [1052, 790.123] - - [500, 2048, 1, 512] - - [1179, 7249.56] + - [1182, 7249.56] - - [200, 100, 1, 512] - - [1055, 748.538] + - [1058, 748.538] - - [32, 2000, 1, 100] - - [1050, 930.233] + - [1053, 930.233] - - [500, 512, 1, 2048] - - [1201, 6639.92] + - [1204, 6639.92] - - [500, 2000, 1, 500] - - [1181, 7078.14] + - [1184, 7078.14] - - [200, 100, 1, 2048] - - [1059, 1387.53] + - [1062, 1387.53] - - [2, 2048, 1, 100] - - [1093, 64.8101] + - [1096, 64.8101] - - [8, 100, 1, 10] - - [1034, 1.75439] + - [1037, 1.75439] - - [200, 2048, 1, 2048] - - [1198, 5021.92] + - [1201, 5021.92] - - [200, 2048, 1, 500] - - [1149, 5355.65] + - [1152, 5355.65] - - [100, 100, 1, 500] - - [1219, 416.667] + - [1222, 416.667] - - [8, 2048, 1, 10] - - [1090, 34.7119] + - [1093, 34.7119] - - [100, 500, 1, 10] - - [1030, 93.2836] + - [1033, 93.2836] - - [200, 500, 1, 2000] - - [1222, 4152.82] + - [1225, 4152.82] - - [512, 2000, 1, 500] - - [1151, 7485.38] + - [1154, 7485.38] - - [10, 500, 1, 1024] - - [1117, 363.636] + - [1120, 363.636] - - [256, 100, 1, 10] - - [1166, 41.0256] + - [1169, 41.0256] - - [500, 512, 1, 1024] - - [1145, 6362.72] + - [1148, 6362.72] - - [200, 2048, 1, 2000] - - [1187, 6320.99] + - [1190, 6320.99] - - [100, 1024, 1, 100] - - [1186, 1306.12] + - [1189, 1306.12] - - [500, 1024, 1, 100] - - [1125, 3699.42] + - [1128, 3699.42] - - [10, 512, 1, 2048] - - [1049, 361.08] + - [1052, 361.08] - - [2, 1024, 1, 512] - - [1098, 105.703] + - [1101, 105.703] - - [4, 500, 1, 2048] - - [1121, 143.417] + - [1124, 143.417] - - [100, 512, 1, 100] - - [1054, 744.186] + - [1057, 744.186] - - [16, 500, 1, 512] - - [1049, 453.097] + - [1052, 453.097] - - [10, 1024, 1, 100] - - [1047, 166.234] + - [1050, 166.234] - - [8, 1024, 1, 100] - - [1095, 140.274] + - [1098, 140.274] - - [64, 2000, 1, 500] - - [1190, 3940.89] + - [1193, 3940.89] - - [64, 1024, 1, 2000] - - [1055, 3531.03] + - [1058, 3531.03] - - [10, 100, 1, 512] - - [1049, 61.5385] + - [1052, 61.5385] - - [4, 500, 1, 2000] - - [1075, 173.01] + - [1078, 173.01] - - [512, 1024, 1, 10] - - [1126, 736.36] + - [1129, 736.36] - - [128, 2048, 1, 2048] - - [1189, 4596.5] + - [1192, 4596.5] - - [4, 100, 1, 100] - - [1038, 7.14286] + - [1041, 7.14286] - - [32, 1024, 1, 512] - - [1098, 1519.68] + - [1101, 1519.68] - - [8, 512, 1, 2000] - - [1123, 356.794] + - [1126, 356.794] - - [100, 100, 1, 512] - - [1063, 426.667] + - [1066, 426.667] - - [2, 2048, 1, 2048] - - [1072, 170.778] + - [1075, 170.778] - - [2, 512, 1, 2000] - - [1075, 90.7801] + - [1078, 90.7801] - - [16, 500, 1, 10] - - [1048, 18.1818] + - [1051, 18.1818] - - [10, 500, 1, 100] - - [1048, 88.0282] + - [1051, 88.0282] - - [4, 100, 1, 500] - - [1113, 23.5849] + - [1116, 23.5849] - - [512, 1024, 1, 1024] - - [1165, 7431.77] + - [1168, 7431.77] - - [64, 500, 1, 100] - - [1058, 506.329] + - [1061, 506.329] - - [128, 2000, 1, 10] - - [1216, 432.432] + - [1219, 432.432] - - [10, 2000, 1, 2048] - - [1079, 806.299] + - [1082, 806.299] - - [2, 100, 1, 100] - - [1036, 3.125] + - [1039, 3.125] - - [10, 512, 1, 2000] - - [1068, 462.094] + - [1071, 462.094] - - [8, 500, 1, 500] - - [1049, 231.481] + - [1052, 231.481] - - [4, 500, 1, 512] - - [1049, 118.519] + - [1052, 118.519] - - [10, 500, 1, 10] - - [1043, 10.9649] + - [1046, 10.9649] - - [64, 512, 1, 2000] - - [1049, 2116.8] + - [1052, 2116.8] - - [500, 512, 1, 10] - - [1170, 395.062] + - [1173, 395.062] - - [200, 512, 1, 512] - - [1188, 3449.26] + - [1191, 3449.26] - - [512, 500, 1, 500] - - [1144, 5536.33] + - [1147, 5536.33] - - [32, 512, 1, 2000] - - [1059, 1264.2] + - [1062, 1264.2] - - [128, 500, 1, 2048] - - [1115, 3006.24] + - [1118, 3006.24] - - [500, 2048, 1, 10] - - [1140, 1049.18] + - [1143, 1049.18] - - [512, 512, 1, 100] - - [1177, 2664.06] + - [1180, 2664.06] - - [200, 2000, 1, 512] - - [1185, 5192.7] + - [1188, 5192.7] - - [500, 500, 1, 512] - - [1141, 5673.76] + - [1144, 5673.76] - - [128, 2048, 1, 500] - - [1173, 5251.28] + - [1176, 5251.28] - - [4, 512, 1, 512] - - [1049, 123.653] + - [1052, 123.653] - - [16, 2048, 1, 2000] - - [1065, 2294.68] + - [1068, 2294.68] - - [16, 500, 1, 1024] - - [1049, 562.637] + - [1052, 562.637] - - [256, 2000, 1, 500] - - [1179, 6639.0] + - [1182, 6639.0] - - [10, 1024, 1, 10] - - [1029, 20.9836] + - [1032, 20.9836] - - [16, 500, 1, 500] - - [1049, 446.429] + - [1052, 446.429] - - [10, 2048, 1, 512] - - [1047, 784.862] + - [1050, 784.862] - - [200, 500, 1, 10] - - [1022, 176.056] + - [1025, 176.056] - - [256, 2048, 1, 512] - - [1176, 6540.83] + - [1179, 6540.83] - - [256, 2000, 1, 2048] - - [1153, 6670.33] + - [1156, 6670.33] - - [500, 2048, 1, 500] - - [1181, 7264.47] + - [1184, 7264.47] - - [500, 100, 1, 1024] - - [1203, 2700.42] + - [1206, 2700.42] - - [16, 100, 1, 512] - - [1113, 96.6038] + - [1116, 96.6038] - - [64, 512, 1, 2048] - - [1114, 1868.29] + - [1117, 1868.29] - - [32, 1024, 1, 10] - - [1025, 69.4237] + - [1028, 69.4237] - - [16, 2048, 1, 512] - - [1098, 1226.4] + - [1101, 1226.4] - - [8, 1024, 1, 512] - - [1098, 416.102] + - [1101, 416.102] - - [4, 1024, 1, 2048] - - [1120, 223.101] + - [1123, 223.101] - - [100, 2048, 1, 2000] - - [1193, 5614.04] + - [1196, 5614.04] - - [512, 512, 1, 2048] - - [1158, 6868.87] + - [1161, 6868.87] - - [256, 2000, 1, 1024] - - [1149, 5758.88] + - [1152, 5758.88] - - [64, 512, 1, 512] - - [1218, 1651.3] + - [1221, 1651.3] - - [200, 1024, 1, 10] - - [1032, 341.333] + - [1035, 341.333] - - [128, 500, 1, 500] - - [1061, 2580.65] + - [1064, 2580.65] - - [100, 512, 1, 1024] - - [1052, 2041.62] + - [1055, 2041.62] - - [16, 1024, 1, 500] - - [1049, 867.797] + - [1052, 867.797] - - [128, 100, 1, 2048] - - [1219, 1011.36] + - [1222, 1011.36] - - [100, 512, 1, 500] - - [1052, 2051.28] + - [1055, 2051.28] - - [8, 1024, 1, 1024] - - [1065, 424.525] + - [1068, 424.525] - - [2, 2000, 1, 10] - - [1091, 8.47458] + - [1094, 8.47458] - - [4, 500, 1, 10] - - [1088, 4.46429] + - [1091, 4.46429] - - [500, 2000, 1, 2048] - - [1165, 7444.02] + - [1168, 7444.02] - - [4, 2000, 1, 100] - - [1101, 128.205] + - [1104, 128.205] - - [512, 2000, 1, 2000] - - [1151, 8454.43] + - [1154, 8454.43] - - [128, 500, 1, 10] - - [1210, 117.647] + - [1213, 117.647] - - [32, 1024, 1, 100] - - [1058, 512.0] + - [1061, 512.0] - - [8, 500, 1, 2048] - - [1073, 286.835] + - [1076, 286.835] - - [16, 1024, 1, 1024] - - [1037, 881.156] + - [1040, 881.156] - - [200, 100, 1, 10] - - [1209, 40.3226] + - [1212, 40.3226] - - [512, 100, 1, 500] - - [1203, 1987.58] + - [1206, 1987.58] - - [512, 2048, 1, 2048] - - [1160, 8063.55] + - [1163, 8063.55] - - [16, 2000, 1, 512] - - [1059, 1204.71] + - [1062, 1204.71] - - [64, 2048, 1, 1024] - - [1057, 2853.27] + - [1060, 2853.27] - - [32, 2048, 1, 10] - - [1031, 130.032] + - [1034, 130.032] - - [10, 2048, 1, 10] - - [1033, 39.3846] + - [1036, 39.3846] - - [4, 2000, 1, 512] - - [1049, 316.049] + - [1052, 316.049] - - [4, 500, 1, 100] - - [1048, 35.7143] + - [1051, 35.7143] - - [8, 100, 1, 2048] - - [1068, 84.6281] + - [1071, 84.6281] - - [512, 2048, 1, 10] - - [1148, 1224.97] + - [1151, 1224.97] - - [512, 100, 1, 10] - - [1137, 90.1408] + - [1140, 90.1408] - - [4, 512, 1, 1024] - - [1049, 143.248] + - [1052, 143.248] - - [16, 2048, 1, 10] - - [1082, 65.0159] + - [1085, 65.0159] - - [500, 2000, 1, 100] - - [1133, 4716.98] + - [1136, 4716.98] - - [32, 1024, 1, 2048] - - [1076, 1582.76] + - [1079, 1582.76] - - [100, 2000, 1, 2000] - - [1193, 5512.68] + - [1196, 5512.68] - - [128, 100, 1, 512] - - [1219, 561.096] + - [1222, 561.096] - - [500, 500, 1, 100] - - [1173, 2460.63] + - [1176, 2460.63] - - [32, 2000, 1, 10] - - [1025, 119.403] + - [1028, 119.403] - - [128, 2048, 1, 100] - - [1173, 2708.1] + - [1176, 2708.1] - - [10, 2000, 1, 100] - - [1048, 316.456] + - [1051, 316.456] - - [2, 2048, 1, 500] - - [1059, 191.045] + - [1062, 191.045] - - [32, 1024, 1, 500] - - [1059, 1563.36] + - [1062, 1563.36] - - [4, 1024, 1, 10] - - [1088, 9.14286] + - [1091, 9.14286] - - [100, 512, 1, 10] - - [1214, 96.9697] + - [1217, 96.9697] - - [8, 100, 1, 100] - - [1064, 14.2857] + - [1067, 14.2857] - - [128, 512, 1, 500] - - [1052, 2677.12] + - [1055, 2677.12] - - [16, 100, 1, 2048] - - [1075, 161.897] + - [1078, 161.897] - - [2, 1024, 1, 10] - - [1088, 4.49123] + - [1091, 4.49123] - - [4, 100, 1, 2048] - - [1068, 41.7959] + - [1071, 41.7959] - - [4, 512, 1, 2000] - - [1068, 180.282] + - [1071, 180.282] - - [4096, 64, 1, 2048] - - [1268, 7247.18] + - [1271, 7247.18] - - [1024, 10080, 1, 1024] - - [1256, 9833.37] + - [1259, 9833.37] - - [1024, 1131, 1, 1024] - - [1234, 7551.85] + - [1237, 7551.85] - - [36548, 1216, 1, 1024] - - [1246, 10351.5] + - [1249, 10351.5] - - [1024, 29, 1, 1024] - - [1278, 1696.91] + - [1281, 1696.91] - - [1024, 2592, 1, 1024] - - [1247, 8424.01] + - [1250, 8424.01] - - [1024, 1568, 1, 1024] - - [1258, 7511.76] + - [1261, 7511.76] - - [4096, 91, 1, 2048] - - [1227, 5599.81] + - [1230, 5599.81] - - [1024, 4445, 1, 1024] - - [1245, 9261.12] + - [1248, 9261.12] - - [1024, 6272, 1, 1024] - - [1240, 9439.51] + - [1243, 9439.51] - - [36548, 3584, 1, 1024] - - [1239, 10393.7] + - [1242, 10393.7] - - [1024, 1827, 1, 1024] - - [1258, 8714.32] + - [1261, 8714.32] - - [1024, 3220, 1, 1024] - - [1238, 8861.1] + - [1241, 8861.1] - - [1024, 1856, 1, 1024] - - [1255, 8826.95] + - [1258, 8826.95] - - [1024, 1760, 1, 1024] - - [1255, 8334.1] + - [1258, 8334.1] - - [1024, 1600, 1, 1024] - - [1255, 7614.97] + - [1258, 7614.97] - - [1024, 1, 1, 21] - - [1259, 0.0] + - [1262, 0.0] - - [36548, 4235, 1, 1024] - - [1239, 10276.7] + - [1242, 10276.7] - - [1024, 49, 1, 1024] - - [1274, 2643.02] + - [1277, 2643.02] - - [1024, 1984, 1, 1024] - - [1258, 9449.42] + - [1261, 9449.42] - - [1024, 14720, 1, 1024] - - [1245, 10033.2] + - [1248, 10033.2] - - [1024, 1152, 1, 1024] - - [1228, 7523.44] + - [1231, 7523.44] - - [36548, 14976, 1, 1024] - - [1246, 10421.6] + - [1249, 10421.6] - - [36548, 1152, 1, 1024] - - [1246, 10258.0] + - [1249, 10258.0] - - [4096, 86, 1, 3072] - - [1227, 5308.75] + - [1230, 5308.75] - - [1024, 3392, 1, 1024] - - [1240, 9176.44] + - [1243, 9176.44] - - [1024, 1408, 1, 1024] - - [1240, 8958.73] + - [1243, 8958.73] - - [1024, 2080, 1, 1024] - - [1231, 8396.39] + - [1234, 8396.39] - - [1024, 1824, 1, 1024] - - [1249, 8671.61] + - [1252, 8671.61] - - [36548, 2432, 1, 1024] - - [1239, 10392.5] + - [1242, 10392.5] - - [4096, 29, 1, 2048] - - [1260, 4325.56] + - [1263, 4325.56] - - [1024, 1102, 1, 1024] - - [1234, 7204.08] + - [1237, 7204.08] - - [4096, 49, 1, 2048] - - [1266, 5609.19] + - [1269, 5609.19] - - [36548, 1827, 1, 1024] - - [1246, 10183.1] + - [1249, 10183.1] - - [4096, 25, 1, 2048] - - [1261, 3788.21] + - [1264, 3788.21] - - [1024, 10176, 1, 1024] - - [1256, 9941.08] + - [1259, 9941.08] - - [1024, 774, 1, 1024] - - [1241, 7079.57] + - [1244, 7079.57] - - [1024, 1952, 1, 1024] - - [1258, 9300.39] + - [1261, 9300.39] - - [4096, 128, 1, 2048] - - [1228, 8274.86] + - [1231, 8274.86] - - [1024, 17024, 1, 1024] - - [1238, 9960.62] + - [1241, 9960.62] - - [1024, 1472, 1, 1024] - - [1247, 9343.27] + - [1250, 9343.27] - - [36548, 4459, 1, 1024] - - [1239, 10358.0] + - [1242, 10358.0] - - [4096, 91, 1, 3072] - - [1233, 5509.29] + - [1236, 5509.29] - - [1024, 3712, 1, 1024] - - [1247, 9048.56] + - [1250, 9048.56] - - [4096, 64, 1, 3072] - - [1280, 7489.83] + - [1283, 7489.83] - - [4096, 29, 1, 3072] - - [1260, 4511.68] + - [1263, 4511.68] - - [4096, 128, 1, 3072] - - [1227, 8423.73] + - [1230, 8423.73] - - [36548, 12928, 1, 1024] - - [1246, 10426.0] + - [1249, 10426.0] - - [1024, 1632, 1, 1024] - - [1228, 7761.63] + - [1231, 7761.63] - - [1024, 1696, 1, 1024] - - [1253, 8107.19] + - [1256, 8107.19] - - [4096, 24, 1, 2048] - - [1260, 3663.15] + - [1263, 3663.15] - - [4096, 63, 1, 3072] - - [1269, 7175.27] + - [1272, 7175.27] - - [4096, 96, 1, 2048] - - [1228, 5866.18] + - [1231, 5866.18] - - [36548, 1764, 1, 1024] - - [1239, 10128.4] + - [1242, 10128.4] - - [4096, 32, 1, 2048] - - [1264, 4540.52] + - [1267, 4540.52] - - [1024, 35, 1, 1024] - - [1272, 1911.47] + - [1275, 1911.47] - - [1024, 1120, 1, 1024] - - [1227, 7289.03] + - [1230, 7289.03] - - [4096, 49, 1, 3072] - - [1266, 5751.52] + - [1269, 5751.52] - - [1024, 24, 1, 1024] - - [1272, 1391.92] + - [1275, 1391.92] - - [1024, 2944, 1, 1024] - - [1248, 9284.83] + - [1251, 9284.83] - - [36548, 14080, 1, 1024] - - [1239, 10441.3] + - [1242, 10441.3] - - [1024, 1, 1, 1024] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 1280, 1, 1024] - - [1227, 8244.36] + - [1230, 8244.36] - - [1024, 13440, 1, 1024] - - [1239, 9799.82] + - [1242, 9799.82] - - [1024, 1015, 1, 1024] - - [1247, 9187.75] + - [1250, 9187.75] - - [36548, 9120, 1, 1024] - - [1239, 10399.9] + - [1242, 10399.9] - - [36548, 1, 1, 1024] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 3008, 1, 1024] - - [1248, 9468.45] + - [1251, 9468.45] - - [1024, 2560, 1, 1024] - - [1245, 8879.21] + - [1248, 8879.21] - - [1024, 21, 1, 1024] - - [1271, 1234.31] + - [1274, 1234.31] - - [1024, 2208, 1, 1024] - - [1227, 8231.17] + - [1230, 8231.17] - - [1024, 96, 1, 1024] - - [1277, 3767.34] + - [1280, 3767.34] - - [4096, 86, 1, 2048] - - [1228, 5528.99] + - [1231, 5528.99] - - [4096, 96, 1, 3072] - - [1227, 6273.18] + - [1230, 6273.18] - - [1024, 1920, 1, 1024] - - [1257, 9118.09] + - [1260, 9118.09] - - [4096, 27, 1, 2048] - - [1260, 4073.6] + - [1263, 4073.6] - - [36548, 2496, 1, 1024] - - [1239, 10361.1] + - [1242, 10361.1] - - [1024, 1, 1, 14] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 91, 1, 1024] - - [1279, 3647.57] + - [1282, 3647.57] - - [1024, 2016, 1, 1024] - - [1255, 9560.14] + - [1258, 9560.14] - - [1024, 1184, 1, 1024] - - [1228, 7678.86] + - [1231, 7678.86] - - [4096, 1, 1, 2048] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 1664, 1, 1024] - - [1253, 7933.97] + - [1256, 7933.97] - - [1024, 11424, 1, 1024] - - [1245, 9777.81] + - [1248, 9777.81] - - [4096, 24, 1, 3072] - - [1263, 3813.0] + - [1266, 3813.0] - - [1024, 1216, 1, 1024] - - [1227, 7902.03] + - [1230, 7902.03] - - [36548, 3185, 1, 1024] - - [1239, 10336.6] + - [1242, 10336.6] - - [36548, 9216, 1, 1024] - - [1239, 10414.2] + - [1242, 10414.2] - - [1024, 3200, 1, 1024] - - [1245, 8846.91] + - [1248, 8846.91] - - [1024, 2656, 1, 1024] - - [1240, 8649.15] + - [1243, 8649.15] - - [1024, 2368, 1, 1024] - - [1240, 8873.06] + - [1243, 8873.06] - - [1024, 4459, 1, 1024] - - [1247, 9431.22] + - [1250, 9431.22] - - [1024, 3808, 1, 1024] - - [1247, 9263.62] + - [1250, 9263.62] - - [1024, 2336, 1, 1024] - - [1240, 8965.9] + - [1243, 8965.9] - - [4096, 27, 1, 3072] - - [1260, 4171.64] + - [1263, 4171.64] - - [1024, 2304, 1, 1024] - - [1237, 8601.28] + - [1240, 8601.28] - - [1024, 1560, 1, 1024] - - [1252, 7481.64] + - [1255, 7481.64] - - [4096, 35, 1, 3072] - - [1266, 4176.8] + - [1269, 4176.8] - - [1024, 2496, 1, 1024] - - [1243, 9092.76] + - [1246, 9092.76] - - [1024, 1504, 1, 1024] - - [1243, 9220.43] + - [1246, 9220.43] - - [4096, 50, 1, 2048] - - [1267, 5472.73] + - [1270, 5472.73] - - [1024, 3232, 1, 1024] - - [1240, 8961.84] + - [1243, 8961.84] - - [1024, 14, 1, 1024] - - [1271, 882.215] + - [1274, 882.215] - - [36548, 1015, 1, 1024] - - [1239, 10140.8] + - [1242, 10140.8] - - [1024, 2000, 1, 1024] - - [1251, 9487.7] + - [1254, 9487.7] - - [36548, 243, 1, 1024] - - [1244, 9441.02] + - [1247, 9441.02] - - [36548, 32, 1, 1024] - - [1232, 4720.95] + - [1235, 4720.95] - - [1024, 25, 1, 1024] - - [1278, 1462.86] + - [1281, 1462.86] - - [1024, 13184, 1, 1024] - - [1242, 9866.18] + - [1245, 9866.18] - - [1024, 2688, 1, 1024] - - [1237, 8559.83] + - [1240, 8559.83] - - [1024, 27, 1, 1024] - - [1276, 1559.01] + - [1279, 1559.01] - - [36548, 950, 1, 1024] - - [1246, 10053.5] + - [1249, 10053.5] - - [1024, 1764, 1, 1024] - - [1253, 8347.01] + - [1256, 8347.01] - - [1024, 992, 1, 1024] - - [1240, 9035.72] + - [1243, 9035.72] - - [1024, 1376, 1, 1024] - - [1240, 8797.86] + - [1243, 8797.86] - - [1024, 950, 1, 1024] - - [1247, 8635.16] + - [1250, 8635.16] - - [36548, 774, 1, 1024] - - [1239, 9460.72] + - [1242, 9460.72] - - [36548, 25, 1, 1024] - - [1232, 3694.06] + - [1235, 3694.06] - - [1024, 4256, 1, 1024] - - [1240, 9172.06] + - [1243, 9172.06] - - [4096, 32, 1, 3072] - - [1261, 4886.57] + - [1264, 4886.57] - - [1024, 243, 1, 1024] - - [1265, 6594.31] + - [1268, 6594.31] - - [36548, 3712, 1, 1024] - - [1239, 10401.5] + - [1242, 10401.5] - - [1024, 50, 1, 1024] - - [1274, 2742.09] + - [1277, 2742.09] - - [1024, 3360, 1, 1024] - - [1236, 9017.27] + - [1239, 9017.27] - - [1024, 2048, 1, 1024] - - [1251, 9736.55] + - [1254, 9736.55] - - [1024, 2784, 1, 1024] - - [1247, 8835.5] + - [1250, 8835.5] - - [1024, 4992, 1, 1024] - - [1245, 9639.28] + - [1248, 9639.28] - - [36548, 1102, 1, 1024] - - [1246, 9858.94] + - [1249, 9858.94] - - [1024, 1536, 1, 1024] - - [1238, 9294.88] + - [1241, 9294.88] - - [1024, 2720, 1, 1024] - - [1243, 8617.78] + - [1246, 8617.78] - - [4096, 1, 1, 3072] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 2752, 1, 1024] - - [1247, 8902.07] + - [1250, 8902.07] - - [1024, 2816, 1, 1024] - - [1245, 8906.85] + - [1248, 8906.85] - - [1024, 2624, 1, 1024] - - [1247, 8494.31] + - [1250, 8494.31] - - [1024, 2144, 1, 1024] - - [1230, 8243.46] + - [1233, 8243.46] - - [36548, 1131, 1, 1024] - - [1246, 10104.5] + - [1249, 10104.5] - - [4096, 25, 1, 3072] - - [1261, 3959.88] + - [1264, 3959.88] - - [1024, 64, 1, 1024] - - [1274, 3410.0] + - [1277, 3410.0] - - [1024, 3296, 1, 1024] - - [1245, 9066.42] + - [1248, 9066.42] - - [36548, 4992, 1, 1024] - - [1239, 10395.5] + - [1242, 10395.5] - - [1024, 1344, 1, 1024] - - [1240, 8522.56] + - [1243, 8522.56] - - [36548, 2401, 1, 1024] - - [1239, 10250.2] + - [1242, 10250.2] - - [1024, 15744, 1, 1024] - - [1239, 10006.3] + - [1242, 10006.3] - - [1024, 15232, 1, 1024] - - [1238, 9912.11] + - [1241, 9912.11] - - [1024, 1888, 1, 1024] - - [1250, 8962.88] + - [1253, 8962.88] - - [1024, 1792, 1, 1024] - - [1254, 8556.72] + - [1257, 8556.72] - - [36548, 1073, 1, 1024] - - [1239, 10161.1] + - [1242, 10161.1] - - [4096, 50, 1, 3072] - - [1266, 5882.06] + - [1269, 5882.06] - - [36548, 15488, 1, 1024] - - [1246, 10437.0] + - [1249, 10437.0] - - [1024, 2464, 1, 1024] - - [1243, 8879.92] + - [1246, 8879.92] - - [1024, 2272, 1, 1024] - - [1240, 8720.25] + - [1243, 8720.25] - - [1024, 13, 1, 1024] - - [1270, 774.516] + - [1273, 774.516] - - [1024, 2432, 1, 1024] - - [1245, 8491.43] + - [1248, 8491.43] - - [36548, 24, 1, 1024] - - [1232, 3564.31] + - [1235, 3564.31] - - [1024, 3936, 1, 1024] - - [1255, 9433.2] + - [1258, 9433.2] - - [36548, 13824, 1, 1024] - - [1239, 10439.7] + - [1242, 10439.7] - - [1024, 2401, 1, 1024] - - [1247, 8869.93] + - [1250, 8869.93] - - [1024, 32, 1, 1024] - - [1262, 1839.61] + - [1265, 1839.61] - - [1024, 2176, 1, 1024] - - [1231, 8544.45] + - [1234, 8544.45] - - [1024, 2240, 1, 1024] - - [1240, 8381.45] + - [1243, 8381.45] - - [1024, 1728, 1, 1024] - - [1228, 8212.23] + - [1231, 8212.23] - - [1024, 128, 1, 1024] - - [1275, 4660.34] + - [1278, 4660.34] - - [1024, 216, 1, 1024] - - [1265, 5777.87] + - [1268, 5777.87] - - [1024, 63, 1, 1024] - - [1273, 3329.65] + - [1276, 3329.65] - - [1024, 86, 1, 1024] - - [1279, 3533.6] + - [1282, 3533.6] - - [1024, 2528, 1, 1024] - - [1235, 8789.15] + - [1238, 8789.15] - - [1024, 2400, 1, 1024] - - [1240, 8939.3] + - [1243, 8939.3] - - [1024, 1440, 1, 1024] - - [1247, 9131.31] + - [1250, 9131.31] - - [1024, 2912, 1, 1024] - - [1240, 9139.93] + - [1243, 9139.93] - - [4096, 35, 1, 2048] - - [1266, 4059.75] + - [1269, 4059.75] - - [4096, 63, 1, 2048] - - [1268, 6946.4] + - [1271, 6946.4] - - [1024, 2880, 1, 1024] - - [1238, 9104.88] + - [1241, 9104.88] - - [1024, 4064, 1, 1024] - - [1257, 9715.1] + - [1260, 9715.1] - - [1024, 4655, 1, 1024] - - [1245, 9033.8] + - [1248, 9033.8] - - [1024, 1088, 1, 1024] - - [1229, 8144.31] + - [1232, 8144.31] - - [36548, 6272, 1, 1024] - - [1246, 10427.3] + - [1249, 10427.3] - - [1024, 1, 1, 13] - - [1259, 0.0] + - [1262, 0.0] - - [768, 512, 1, 768] - - [1283, 5889.04] + - [1286, 5889.04] - - [768, 2048, 1, 3072] - - [1293, 9394.62] + - [1296, 9394.62] - - [768, 32, 1, 768] - - [1305, 1502.74] + - [1308, 1502.74] - - [64, 128, 96, 128] - - [1300, 4973.48] + - [1303, 4973.48] - - [3072, 1024, 1, 768] - - [1294, 9856.07] + - [1297, 9856.07] - - [768, 1024, 1, 3072] - - [1287, 8611.06] + - [1290, 8611.06] - - [768, 512, 1, 3072] - - [1286, 6430.79] + - [1289, 6430.79] - - [768, 64, 1, 768] - - [1307, 2621.44] + - [1310, 2621.44] - - [768, 4096, 1, 3072] - - [1292, 10030.4] + - [1295, 10030.4] - - [768, 2048, 1, 2] - - [1285, 381.763] + - [1288, 381.763] - - [768, 2048, 1, 768] - - [1290, 9754.2] + - [1293, 9754.2] - - [768, 320, 1, 30522] - - [1303, 8529.4] + - [1306, 8529.4] - - [64, 64, 96, 64] - - [1297, 2496.61] + - [1300, 2496.61] - - [768, 640, 1, 30522] - - [1284, 8253.84] + - [1287, 8253.84] - - [768, 1280, 1, 30522] - - [1289, 9572.85] + - [1292, 9572.85] - - [768, 1280, 1, 768] - - [1293, 8713.93] + - [1296, 8713.93] - - [768, 640, 1, 768] - - [1283, 7293.03] + - [1286, 7293.03] - - [768, 32, 1, 2] - - [1295, 11.8154] + - [1298, 11.8154] - - [3072, 2048, 1, 768] - - [1290, 10019.6] + - [1293, 10019.6] - - [768, 4096, 1, 768] - - [1290, 9927.35] + - [1293, 9927.35] - - [3072, 4096, 1, 768] - - [1293, 10150.1] + - [1296, 10150.1] - - [64, 256, 192, 256] - - [1299, 7054.19] + - [1302, 7054.19] - - [768, 8, 1, 768] - - [1306, 340.939] + - [1309, 340.939] - - [64, 128, 384, 128] - - [1298, 6765.01] + - [1301, 6765.01] - - [768, 1024, 1, 768] - - [1288, 8768.58] + - [1291, 8768.58] - - [768, 320, 1, 768] - - [1304, 6838.54] + - [1307, 6838.54] - - [64, 64, 768, 64] - - [1301, 5388.83] + - [1304, 5388.83] - - [768, 1024, 1, 2] - - [1281, 258.695] + - [1284, 258.695] - - [768, 16, 1, 768] - - [1306, 819.2] + - [1309, 819.2] - - [64, 256, 96, 256] - - [1299, 5893.64] + - [1302, 5893.64] - - [3072, 512, 1, 768] - - [1291, 9722.79] + - [1294, 9722.79] - - [768, 160, 1, 768] - - [1308, 5019.78] + - [1311, 5019.78] - - [768, 4096, 1, 2] - - [1282, 507.375] + - [1285, 507.375] - - [1600, 512, 1, 1024] - - [1312, 7186.95] + - [1315, 7186.95] - - [1024, 512, 1, 64] - - [1310, 2557.5] + - [1313, 2557.5] - - [1024, 512, 1, 1] - - [1309, 71.2348] + - [1312, 71.2348] - - [2048, 512, 1, 1] - - [1311, 90.3945] + - [1314, 90.3945] - - [1024, 200, 1, 1] - - [1317, 40.0] + - [1320, 40.0] - - [32, 200, 1, 1] - - [1313, 1.56863] + - [1316, 1.56863] - - [560, 200, 1, 1024] - - [1321, 4731.35] + - [1324, 4731.35] - - [1, 512, 1, 1] - - [1320, 0.130612] + - [1323, 0.130612] - - [64, 512, 1, 1] - - [1315, 7.58519] + - [1318, 7.58519] - - [1024, 8192, 1, 256] - - [1330, 9518.99] + - [1333, 9518.99] - - [1024, 22016, 1, 256] - - [1336, 9881.12] + - [1339, 9881.12] - - [256, 8976, 1, 4352] - - [1328, 9567.08] + - [1331, 9567.08] - - [512, 256, 1, 2048] - - [1341, 5917.89] + - [1344, 5917.89] - - [1024, 19968, 1, 256] - - [1336, 9882.37] + - [1339, 9882.37] - - [256, 8976, 1, 1536] - - [1326, 8437.35] + - [1329, 8437.35] - - [256, 8976, 1, 33536] - - [1326, 8441.89] + - [1329, 8441.89] - - [1024, 1792, 1, 256] - - [1326, 7756.97] + - [1329, 7756.97] - - [1024, 21504, 1, 256] - - [1336, 9893.9] + - [1339, 9893.9] - - [512, 215, 1, 2048] - - [1342, 4665.64] + - [1345, 4665.64] - - [1024, 7168, 1, 256] - - [1330, 9509.35] + - [1333, 9509.35] - - [256, 8976, 1, 15872] - - [1332, 8914.65] + - [1335, 8914.65] - - [1024, 19712, 1, 256] - - [1336, 9771.9] + - [1339, 9771.9] - - [256, 8976, 1, 5632] - - [1332, 8740.03] + - [1335, 8740.03] - - [1024, 14848, 1, 256] - - [1336, 9756.15] + - [1339, 9756.15] - - [1024, 28672, 1, 256] - - [1336, 9958.92] + - [1339, 9958.92] - - [256, 8976, 1, 9728] - - [1339, 8853.04] + - [1342, 8853.04] - - [1024, 17152, 1, 256] - - [1330, 9737.3] + - [1333, 9737.3] - - [256, 8976, 1, 11520] - - [1332, 8999.2] + - [1335, 8999.2] - - [256, 8976, 1, 8192] - - [1322, 7897.32] + - [1325, 7897.32] - - [1024, 3328, 1, 256] - - [1337, 8593.53] + - [1340, 8593.53] - - [256, 8976, 1, 7424] - - [1332, 8980.47] + - [1335, 8980.47] - - [1024, 18944, 1, 256] - - [1336, 9854.85] + - [1339, 9854.85] - - [1024, 10496, 1, 256] - - [1331, 9453.9] + - [1334, 9453.9] - - [256, 8976, 1, 5376] - - [1329, 9608.37] + - [1332, 9608.37] - - [256, 8976, 1, 6144] - - [1326, 7880.13] + - [1329, 7880.13] - - [1024, 40448, 1, 256] - - [1336, 10016.6] + - [1339, 10016.6] - - [256, 8976, 1, 22016] - - [1339, 8939.87] + - [1342, 8939.87] - - [256, 8976, 1, 4864] - - [1327, 9211.43] + - [1330, 9211.43] - - [256, 8976, 1, 12288] - - [1323, 8065.05] + - [1326, 8065.05] - - [1024, 9728, 1, 256] - - [1336, 9636.25] + - [1339, 9636.25] - - [256, 8976, 1, 2048] - - [1324, 7001.33] + - [1327, 7001.33] - - [1024, 10240, 1, 256] - - [1330, 9619.96] + - [1333, 9619.96] - - [256, 8976, 1, 2304] - - [1328, 9509.74] + - [1331, 9509.74] - - [1024, 7936, 1, 256] - - [1336, 9300.67] + - [1339, 9300.67] - - [768, 256, 1, 2048] - - [1340, 6267.95] + - [1343, 6267.95] - - [1024, 9984, 1, 256] - - [1336, 9477.28] + - [1339, 9477.28] - - [1024, 13312, 1, 256] - - [1336, 9758.56] + - [1339, 9758.56] - - [1024, 16128, 1, 256] - - [1330, 9721.9] + - [1333, 9721.9] - - [1024, 8960, 1, 256] - - [1331, 9398.25] + - [1334, 9398.25] - - [1024, 5120, 1, 256] - - [1337, 9315.5] + - [1340, 9315.5] - - [1024, 11264, 1, 256] - - [1330, 9664.8] + - [1333, 9664.8] - - [256, 8976, 1, 20480] - - [1338, 8279.87] + - [1341, 8279.87] - - [1024, 20992, 1, 256] - - [1330, 9878.87] + - [1333, 9878.87] - - [256, 8976, 1, 9472] - - [1332, 8990.96] + - [1335, 8990.96] - - [256, 8976, 1, 8448] - - [1332, 8983.52] + - [1335, 8983.52] - - [256, 8976, 1, 20992] - - [1333, 8942.11] + - [1336, 8942.11] - - [256, 8976, 1, 10496] - - [1333, 8989.71] + - [1336, 8989.71] - - [1024, 15104, 1, 256] - - [1331, 9676.01] + - [1334, 9676.01] - - [1024, 6400, 1, 256] - - [1339, 9145.89] + - [1342, 9145.89] - - [1024, 4096, 1, 256] - - [1332, 9124.25] + - [1335, 9124.25] - - [256, 8976, 1, 2560] - - [1326, 8566.11] + - [1329, 8566.11] - - [256, 8976, 1, 2816] - - [1328, 9496.84] + - [1331, 9496.84] - - [1024, 7680, 1, 256] - - [1336, 9460.84] + - [1339, 9460.84] - - [256, 8976, 1, 14336] - - [1333, 8226.8] + - [1336, 8226.8] - - [256, 8976, 1, 6656] - - [1333, 8771.42] + - [1336, 8771.42] - - [1024, 3072, 1, 256] - - [1333, 9076.94] + - [1336, 9076.94] - - [256, 8976, 1, 5888] - - [1329, 9546.3] + - [1332, 9546.3] - - [1024, 12288, 1, 256] - - [1330, 9690.81] + - [1333, 9690.81] - - [256, 8976, 1, 26112] - - [1335, 8699.83] + - [1338, 8699.83] - - [1024, 7424, 1, 256] - - [1337, 9256.84] + - [1340, 9256.84] - - [256, 8976, 1, 14848] - - [1338, 8885.79] + - [1341, 8885.79] - - [768, 215, 1, 2048] - - [1340, 5628.59] + - [1343, 5628.59] - - [1024, 2560, 1, 256] - - [1333, 8820.83] + - [1336, 8820.83] - - [256, 8976, 1, 19968] - - [1332, 8928.86] + - [1335, 8928.86] - - [256, 8976, 1, 9984] - - [1332, 8993.12] + - [1335, 8993.12] - - [1024, 4864, 1, 256] - - [1333, 8974.3] + - [1336, 8974.3] - - [1024, 33536, 1, 256] - - [1336, 9943.07] + - [1339, 9943.07] - - [256, 8976, 1, 15104] - - [1333, 8996.63] + - [1336, 8996.63] - - [1024, 2048, 1, 256] - - [1331, 8462.66] + - [1334, 8462.66] - - [256, 8976, 1, 8960] - - [1333, 8998.92] + - [1336, 8998.92] - - [1024, 6144, 1, 256] - - [1338, 9359.67] + - [1341, 9359.67] - - [1024, 14592, 1, 256] - - [1336, 9667.42] + - [1339, 9667.42] - - [256, 8976, 1, 19712] - - [1332, 9020.11] + - [1335, 9020.11] - - [1024, 11520, 1, 256] - - [1331, 9527.7] + - [1334, 9527.7] - - [1024, 5632, 1, 256] - - [1330, 9297.2] + - [1333, 9297.2] - - [256, 8976, 1, 11008] - - [1339, 8994.8] + - [1342, 8994.8] - - [256, 8976, 1, 17152] - - [1333, 9003.8] + - [1336, 9003.8] - - [256, 8976, 1, 3072] - - [1322, 8261.96] + - [1325, 8261.96] - - [1024, 3840, 1, 256] - - [1339, 8671.89] + - [1342, 8671.89] - - [1024, 14336, 1, 256] - - [1336, 9760.28] + - [1339, 9760.28] - - [1024, 20480, 1, 256] - - [1330, 9887.85] + - [1333, 9887.85] - - [1024, 23552, 1, 256] - - [1330, 9890.46] + - [1333, 9890.46] - - [256, 8976, 1, 7168] - - [1325, 8478.34] + - [1328, 8478.34] - - [1024, 13568, 1, 256] - - [1330, 9654.64] + - [1333, 9654.64] - - [1024, 4608, 1, 256] - - [1338, 9218.25] + - [1341, 9218.25] - - [256, 8976, 1, 10240] - - [1323, 8076.16] + - [1326, 8076.16] - - [1024, 8704, 1, 256] - - [1332, 9475.5] + - [1335, 9475.5] - - [1024, 11008, 1, 256] - - [1336, 9524.96] + - [1339, 9524.96] - - [1024, 8448, 1, 256] - - [1330, 9352.16] + - [1333, 9352.16] - - [256, 8976, 1, 44505] - - [1334, 8430.23] + - [1337, 8430.23] - - [6272, 256, 1, 528] - - [1386, 7389.94] + - [1389, 7389.94] - - [3136, 2048, 1, 1024] - - [1367, 9657.94] + - [1370, 9657.94] - - [6272, 112, 1, 512] - - [1365, 5931.09] + - [1368, 5931.09] - - [2048, 320, 1, 1280] - - [1385, 7772.99] + - [1388, 7772.99] - - [289, 256, 1, 1568] - - [1406, 3718.17] + - [1409, 3718.17] - - [50176, 128, 1, 256] - - [1368, 8908.58] + - [1371, 8908.58] - - [5329, 64, 1, 448] - - [1351, 4602.2] + - [1354, 4602.2] - - [289, 192, 1, 1344] - - [1403, 3452.59] + - [1406, 3452.59] - - [12544, 1024, 1, 256] - - [1368, 9742.64] + - [1371, 9742.64] - - [784, 64, 32, 192] - - [1344, 6844.61] + - [1347, 6844.61] - - [6272, 64, 1, 480] - - [1352, 5562.24] + - [1355, 5562.24] - - [196, 128, 1, 800] - - [1394, 1639.74] + - [1397, 1639.74] - - [64, 512, 1, 1344] - - [1393, 2313.04] + - [1396, 2313.04] - - [6272, 64, 1, 512] - - [1351, 5609.19] + - [1354, 5609.19] - - [6272, 160, 1, 528] - - [1352, 6149.7] + - [1355, 6149.7] - - [289, 160, 32, 768] - - [1379, 6637.82] + - [1382, 6637.82] - - [12544, 256, 1, 1024] - - [1386, 8790.46] + - [1389, 8790.46] - - [289, 224, 1, 1568] - - [1406, 3270.17] + - [1409, 3270.17] - - [5329, 64, 32, 160] - - [1359, 9091.04] + - [1362, 9091.04] - - [5329, 96, 1, 576] - - [1386, 5555.66] + - [1389, 5555.66] - - [3025, 64, 1, 363] - - [1404, 4392.3] + - [1407, 4392.3] - - [784, 32, 32, 192] - - [1375, 5633.8] + - [1378, 5633.8] - - [3136, 512, 1, 1024] - - [1371, 7553.14] + - [1374, 7553.14] - - [6272, 16, 1, 480] - - [1406, 3219.85] + - [1409, 3219.85] - - [1225, 64, 32, 288] - - [1366, 8240.58] + - [1369, 8240.58] - - [64, 256, 1, 1536] - - [1399, 1456.36] + - [1402, 1456.36] - - [289, 192, 32, 768] - - [1378, 7372.8] + - [1381, 7372.8] - - [2048, 448, 1, 1280] - - [1361, 8403.01] + - [1364, 8403.01] - - [3136, 2048, 1, 512] - - [1360, 9486.31] + - [1363, 9486.31] - - [289, 256, 1, 2016] - - [1406, 3876.08] + - [1409, 3876.08] - - [289, 384, 32, 1024] - - [1345, 7350.54] + - [1348, 7350.54] - - [1568, 32, 1, 832] - - [1395, 2717.87] + - [1398, 2717.87] - - [3136, 64, 32, 64] - - [1348, 7657.26] + - [1351, 7657.26] - - [289, 160, 1, 1120] - - [1402, 2826.9] + - [1405, 2826.9] - - [6272, 128, 1, 528] - - [1356, 6926.26] + - [1359, 6926.26] - - [21609, 32, 1, 288] - - [1357, 3698.9] + - [1360, 3698.9] - - [1225, 192, 1, 1728] - - [1390, 7309.81] + - [1393, 7309.81] - - [4096, 512, 1, 4096] - - [1373, 10272.1] + - [1376, 10272.1] - - [64, 256, 1, 1152] - - [1399, 1387.82] + - [1402, 1387.82] - - [6272, 96, 1, 480] - - [1387, 6371.56] + - [1390, 6371.56] - - [784, 96, 1, 800] - - [1407, 3330.27] + - [1410, 3330.27] - - [2048, 448, 1, 2048] - - [1361, 8622.65] + - [1364, 8622.65] - - [784, 96, 32, 192] - - [1376, 7092.36] + - [1379, 7092.36] - - [289, 224, 1, 1344] - - [1406, 3180.01] + - [1409, 3180.01] - - [1001, 512, 1, 4096] - - [1347, 8195.07] + - [1350, 8195.07] - - [2048, 192, 1, 1280] - - [1352, 6120.09] + - [1355, 6120.09] - - [1225, 64, 32, 256] - - [1357, 8076.62] + - [1360, 8076.62] - - [2048, 256, 1, 1536] - - [1347, 8137.7] + - [1350, 8137.7] - - [1225, 64, 1, 1200] - - [1406, 3552.87] + - [1409, 3552.87] - - [6272, 128, 1, 512] - - [1360, 6878.21] + - [1363, 6878.21] - - [729, 192, 1, 1600] - - [1405, 5016.77] + - [1408, 5016.77] - - [289, 192, 1, 896] - - [1403, 3091.87] + - [1406, 3091.87] - - [1568, 384, 1, 832] - - [1386, 6934.62] + - [1389, 6934.62] - - [784, 16, 32, 192] - - [1377, 3380.28] + - [1380, 3380.28] - - [1568, 256, 1, 832] - - [1351, 5980.86] + - [1354, 5980.86] - - [1568, 48, 1, 832] - - [1408, 3275.09] + - [1411, 3275.09] - - [1568, 192, 1, 832] - - [1346, 4441.11] + - [1349, 4441.11] - - [289, 192, 32, 1024] - - [1349, 6563.06] + - [1352, 6563.06] - - [6272, 32, 1, 528] - - [1390, 4998.67] + - [1393, 4998.67] - - [49, 128, 1, 1200] - - [1391, 550.175] + - [1394, 550.175] - - [1225, 64, 32, 384] - - [1363, 8589.33] + - [1366, 8589.33] - - [289, 128, 1, 896] - - [1402, 2103.1] + - [1405, 2103.1] - - [1568, 160, 1, 832] - - [1390, 6995.05] + - [1393, 6995.05] - - [1001, 32, 1, 1024] - - [1399, 1744.72] + - [1402, 1744.72] - - [2048, 320, 1, 2048] - - [1384, 7118.04] + - [1387, 7118.04] - - [2048, 384, 1, 1536] - - [1347, 8184.01] + - [1350, 8184.01] - - [50176, 512, 1, 256] - - [1359, 9852.4] + - [1362, 9852.4] - - [289, 256, 1, 1792] - - [1408, 3809.75] + - [1411, 3809.75] - - [64, 448, 1, 1152] - - [1400, 2128.23] + - [1403, 2128.23] - - [5041, 96, 1, 576] - - [1385, 5279.3] + - [1388, 5279.3] - - [6272, 192, 1, 480] - - [1347, 7479.65] + - [1350, 7479.65] - - [784, 32, 32, 256] - - [1374, 5708.91] + - [1377, 5708.91] - - [1001, 32, 1, 2048] - - [1401, 2141.04] + - [1404, 2141.04] - - [289, 192, 1, 1120] - - [1397, 3277.77] + - [1400, 3277.77] - - [6272, 32, 1, 512] - - [1389, 4978.7] + - [1392, 4978.7] - - [289, 384, 1, 3456] - - [1406, 5904.14] + - [1409, 5904.14] - - [289, 384, 1, 2592] - - [1407, 5707.34] + - [1410, 5707.34] - - [12544, 1024, 1, 512] - - [1368, 10008.3] + - [1371, 10008.3] - - [12544, 256, 1, 512] - - [1386, 8628.08] + - [1389, 8628.08] - - [6272, 24, 1, 512] - - [1390, 3568.07] + - [1393, 3568.07] - - [5041, 192, 1, 720] - - [1361, 8424.42] + - [1364, 8424.42] - - [64, 320, 1, 1728] - - [1394, 1469.66] + - [1397, 1469.66] - - [784, 128, 32, 256] - - [1362, 8104.14] + - [1365, 8104.14] - - [289, 96, 1, 864] - - [1400, 1838.25] + - [1403, 1838.25] - - [1225, 32, 32, 192] - - [1381, 5949.72] + - [1384, 5949.72] - - [1568, 128, 1, 832] - - [1389, 5718.69] + - [1392, 5718.69] - - [289, 128, 32, 768] - - [1347, 7289.25] + - [1350, 7289.25] - - [196, 64, 1, 800] - - [1393, 915.62] + - [1396, 915.62] - - [4096, 512, 1, 9216] - - [1370, 10351.4] + - [1373, 10351.4] - - [12544, 64, 1, 147] - - [1360, 5069.33] + - [1363, 5069.33] - - [784, 32, 1, 400] - - [1391, 1140.36] + - [1394, 1140.36] - - [6272, 160, 1, 512] - - [1351, 6140.08] + - [1354, 6140.08] - - [1225, 48, 32, 288] - - [1357, 5978.61] + - [1360, 5978.61] - - [64, 320, 1, 2880] - - [1398, 1920.0] + - [1401, 1920.0] - - [1225, 64, 32, 192] - - [1351, 7641.01] + - [1354, 7641.01] - - [1001, 32, 1, 1536] - - [1399, 2084.79] + - [1402, 2084.79] - - [784, 64, 32, 256] - - [1343, 6990.51] + - [1346, 6990.51] - - [64, 384, 1, 1152] - - [1400, 1862.6] + - [1403, 1862.6] - - [3136, 512, 1, 2048] - - [1372, 7764.3] + - [1375, 7764.3] - - [6272, 144, 1, 512] - - [1347, 5574.04] + - [1350, 5574.04] - - [1225, 192, 32, 384] - - [1361, 9373.83] + - [1364, 9373.83] - - [64, 192, 1, 1728] - - [1399, 1206.46] + - [1402, 1206.46] - - [8192, 320, 1, 1280] - - [1413, 9875.92] + - [1416, 9875.92] - - [8192, 320, 1, 2048] - - [1416, 9745.7] + - [1419, 9745.7] - - [8192, 384, 1, 1280] - - [1413, 10046.2] + - [1416, 10046.2] - - [8192, 192, 1, 1280] - - [1416, 9950.9] + - [1419, 9950.9] - - [8192, 192, 1, 2048] - - [1412, 9559.67] + - [1415, 9559.67] - - [8192, 384, 1, 2048] - - [1414, 9945.74] + - [1417, 9945.74] - - [8192, 448, 1, 2048] - - [1415, 9908.51] + - [1418, 9908.51] - - [1001, 64, 1, 1536] - - [1409, 3649.94] + - [1412, 3649.94] - - [8192, 448, 1, 1280] - - [1413, 9981.35] + - [1416, 9981.35] - - [1001, 64, 1, 2048] - - [1410, 3580.87] + - [1413, 3580.87] - - [1001, 128, 1, 2048] - - [1411, 5587.87] + - [1414, 5587.87] - - [3200, 1024, 1, 2048] - - [1419, 9131.95] + - [1422, 9131.95] - - [2048, 1024, 1, 256] - - [1418, 8452.0] + - [1421, 8452.0] - - [257, 1024, 1, 4096] - - [1417, 4225.21] + - [1420, 4225.21] - - [3136, 64, 64, 64] - - [1420, 8028.16] + - [1423, 8028.16] - - [1225, 32, 64, 192] - - [1426, 6968.89] + - [1429, 6968.89] - - [3136, 64, 64, 256] - - [1421, 9678.4] + - [1424, 9678.4] - - [3136, 256, 64, 64] - - [1422, 8998.29] + - [1425, 8998.29] - - [1225, 64, 64, 288] - - [1425, 8893.83] + - [1428, 8893.83] - - [289, 128, 64, 768] - - [1423, 8442.75] + - [1426, 8442.75] - - [5329, 80, 64, 64] - - [1427, 6687.37] + - [1430, 6687.37] - - [1225, 64, 64, 192] - - [1424, 8339.5] + - [1427, 8339.5] - - [1225, 64, 64, 256] - - [1428, 8721.52] + - [1431, 8721.52] + - - [65, 6400, 1, 1024] + - [1432, 2839.89] + - - [256, 6400, 1, 4096] + - [1433, 7361.66] + - - [1024, 64, 1, 4096] + - [1434, 3787.18] - null diff --git a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml index b02f82523..c1422a670 100644 --- a/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_ci/vega20_Cijk_Alik_Bljk_SB.yaml @@ -96477,6 +96477,539 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL1_GRVW2_GSU1_PGR0_PLR1_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96601,7 +97134,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 590 + SolutionIndex: 593 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -96750,7 +97283,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 591 + SolutionIndex: 594 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -96895,7 +97428,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 592 + SolutionIndex: 595 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -97044,7 +97577,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 593 + SolutionIndex: 596 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -97193,7 +97726,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 594 + SolutionIndex: 597 SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 SubGroup0: 12 SubGroup1: 16 @@ -97342,7 +97875,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 595 + SolutionIndex: 598 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -97491,7 +98024,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 596 + SolutionIndex: 599 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -97640,7 +98173,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 597 + SolutionIndex: 600 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -97789,7 +98322,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 598 + SolutionIndex: 601 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -97938,7 +98471,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 599 + SolutionIndex: 602 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98087,7 +98620,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 600 + SolutionIndex: 603 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98236,7 +98769,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 601 + SolutionIndex: 604 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98385,7 +98918,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 602 + SolutionIndex: 605 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -98534,7 +99067,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 603 + SolutionIndex: 606 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98683,7 +99216,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 604 + SolutionIndex: 607 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98832,7 +99365,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 605 + SolutionIndex: 608 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -98981,7 +99514,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 606 + SolutionIndex: 609 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -99130,7 +99663,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 607 + SolutionIndex: 610 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -99279,7 +99812,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 608 + SolutionIndex: 611 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99428,7 +99961,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 609 + SolutionIndex: 612 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99577,7 +100110,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 610 + SolutionIndex: 613 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99726,7 +100259,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 611 + SolutionIndex: 614 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -99875,7 +100408,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 612 + SolutionIndex: 615 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100024,7 +100557,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 613 + SolutionIndex: 616 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100173,7 +100706,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 614 + SolutionIndex: 617 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -100322,7 +100855,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 615 + SolutionIndex: 618 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -100471,7 +101004,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 616 + SolutionIndex: 619 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100620,7 +101153,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 617 + SolutionIndex: 620 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100769,7 +101302,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 618 + SolutionIndex: 621 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -100918,7 +101451,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 619 + SolutionIndex: 622 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101067,7 +101600,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 620 + SolutionIndex: 623 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101216,7 +101749,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 621 + SolutionIndex: 624 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101365,7 +101898,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 622 + SolutionIndex: 625 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101514,7 +102047,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 623 + SolutionIndex: 626 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101663,7 +102196,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 624 + SolutionIndex: 627 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101812,7 +102345,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 625 + SolutionIndex: 628 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101961,7 +102494,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 626 + SolutionIndex: 629 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102110,7 +102643,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 627 + SolutionIndex: 630 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102259,7 +102792,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 628 + SolutionIndex: 631 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102408,7 +102941,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 629 + SolutionIndex: 632 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102557,7 +103090,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 630 + SolutionIndex: 633 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102706,7 +103239,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 631 + SolutionIndex: 634 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102855,7 +103388,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 632 + SolutionIndex: 635 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -103004,7 +103537,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 633 + SolutionIndex: 636 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -103153,7 +103686,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 634 + SolutionIndex: 637 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -103302,7 +103835,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 + SolutionIndex: 638 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -103451,7 +103984,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 636 + SolutionIndex: 639 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -103600,7 +104133,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 637 + SolutionIndex: 640 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -103749,7 +104282,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 638 + SolutionIndex: 641 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -103898,7 +104431,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 639 + SolutionIndex: 642 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104047,7 +104580,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 640 + SolutionIndex: 643 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104196,7 +104729,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 641 + SolutionIndex: 644 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -104345,7 +104878,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 642 + SolutionIndex: 645 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104494,7 +105027,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 643 + SolutionIndex: 646 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104643,7 +105176,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 644 + SolutionIndex: 647 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104792,7 +105325,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 645 + SolutionIndex: 648 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104941,7 +105474,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 646 + SolutionIndex: 649 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105090,7 +105623,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 647 + SolutionIndex: 650 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105239,7 +105772,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 648 + SolutionIndex: 651 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105388,7 +105921,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 + SolutionIndex: 652 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105537,7 +106070,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 + SolutionIndex: 653 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105686,7 +106219,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 + SolutionIndex: 654 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105835,7 +106368,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 + SolutionIndex: 655 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -105984,7 +106517,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 + SolutionIndex: 656 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106133,7 +106666,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 + SolutionIndex: 657 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106282,7 +106815,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 + SolutionIndex: 658 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106431,7 +106964,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 + SolutionIndex: 659 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106580,7 +107113,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 + SolutionIndex: 660 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -106729,7 +107262,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 + SolutionIndex: 661 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -106878,7 +107411,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 + SolutionIndex: 662 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -107027,7 +107560,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 + SolutionIndex: 663 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107176,7 +107709,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 + SolutionIndex: 664 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107325,7 +107858,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 + SolutionIndex: 665 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107474,7 +108007,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 + SolutionIndex: 666 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -107623,7 +108156,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 + SolutionIndex: 667 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -107772,7 +108305,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 + SolutionIndex: 668 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -107921,7 +108454,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 + SolutionIndex: 669 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108070,7 +108603,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 + SolutionIndex: 670 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108219,7 +108752,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 + SolutionIndex: 671 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108368,7 +108901,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 + SolutionIndex: 672 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -108517,7 +109050,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 + SolutionIndex: 673 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108666,7 +109199,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 + SolutionIndex: 674 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108815,7 +109348,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 + SolutionIndex: 675 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108964,7 +109497,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 + SolutionIndex: 676 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109113,7 +109646,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 + SolutionIndex: 677 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109262,7 +109795,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 + SolutionIndex: 678 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109411,7 +109944,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 + SolutionIndex: 679 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109560,7 +110093,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 + SolutionIndex: 680 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109709,7 +110242,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 + SolutionIndex: 681 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109858,7 +110391,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 + SolutionIndex: 682 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110007,7 +110540,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 + SolutionIndex: 683 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110156,7 +110689,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 + SolutionIndex: 684 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110305,7 +110838,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 682 + SolutionIndex: 685 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110454,7 +110987,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 + SolutionIndex: 686 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110603,7 +111136,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 + SolutionIndex: 687 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110752,7 +111285,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 + SolutionIndex: 688 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110901,7 +111434,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 + SolutionIndex: 689 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111050,7 +111583,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 + SolutionIndex: 690 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111199,7 +111732,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 + SolutionIndex: 691 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111348,7 +111881,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 + SolutionIndex: 692 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111497,7 +112030,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 + SolutionIndex: 693 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111646,7 +112179,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 + SolutionIndex: 694 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111795,7 +112328,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 + SolutionIndex: 695 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111944,7 +112477,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 + SolutionIndex: 696 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112093,7 +112626,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 + SolutionIndex: 697 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112242,7 +112775,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 + SolutionIndex: 698 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112391,7 +112924,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 + SolutionIndex: 699 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112540,7 +113073,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 + SolutionIndex: 700 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112689,7 +113222,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 + SolutionIndex: 701 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112838,7 +113371,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 + SolutionIndex: 702 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112987,7 +113520,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 + SolutionIndex: 703 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -113136,7 +113669,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 + SolutionIndex: 704 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -113285,7 +113818,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 + SolutionIndex: 705 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -113434,7 +113967,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 + SolutionIndex: 706 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -113583,7 +114116,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 + SolutionIndex: 707 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -113732,7 +114265,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 + SolutionIndex: 708 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -113881,7 +114414,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 + SolutionIndex: 709 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -114030,7 +114563,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 + SolutionIndex: 710 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -114179,7 +114712,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 + SolutionIndex: 711 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -114324,7 +114857,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 + SolutionIndex: 712 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -114470,7 +115003,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 + SolutionIndex: 713 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -114616,7 +115149,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 + SolutionIndex: 714 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -114762,7 +115295,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 + SolutionIndex: 715 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 SubGroup0: 16 SubGroup1: 4 @@ -114908,7 +115441,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 + SolutionIndex: 716 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -115054,7 +115587,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 + SolutionIndex: 717 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -115200,7 +115733,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 + SolutionIndex: 718 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -115346,7 +115879,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 + SolutionIndex: 719 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -115492,7 +116025,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 + SolutionIndex: 720 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -115649,7 +116182,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 + SolutionIndex: 721 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -115811,7 +116344,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 + SolutionIndex: 722 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -115973,7 +116506,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 + SolutionIndex: 723 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116135,7 +116668,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 + SolutionIndex: 724 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116297,7 +116830,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 + SolutionIndex: 725 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116459,7 +116992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 + SolutionIndex: 726 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116621,7 +117154,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 + SolutionIndex: 727 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -116783,7 +117316,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 + SolutionIndex: 728 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -116945,7 +117478,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 + SolutionIndex: 729 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -117107,7 +117640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 + SolutionIndex: 730 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -117269,7 +117802,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 + SolutionIndex: 731 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117431,7 +117964,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 + SolutionIndex: 732 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117593,7 +118126,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 + SolutionIndex: 733 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117755,7 +118288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 + SolutionIndex: 734 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -117913,7 +118446,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 + SolutionIndex: 735 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -118075,7 +118608,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 + SolutionIndex: 736 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118237,7 +118770,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 + SolutionIndex: 737 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118399,7 +118932,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 + SolutionIndex: 738 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118561,7 +119094,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 + SolutionIndex: 739 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -118723,7 +119256,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 + SolutionIndex: 740 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -118885,7 +119418,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 + SolutionIndex: 741 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -119047,7 +119580,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 + SolutionIndex: 742 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -119205,7 +119738,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 + SolutionIndex: 743 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119363,7 +119896,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 + SolutionIndex: 744 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119525,7 +120058,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 + SolutionIndex: 745 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119687,7 +120220,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 743 + SolutionIndex: 746 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119849,7 +120382,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 + SolutionIndex: 747 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -120011,7 +120544,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 + SolutionIndex: 748 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -120173,7 +120706,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 + SolutionIndex: 749 SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -120335,7 +120868,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 + SolutionIndex: 750 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120497,7 +121030,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 + SolutionIndex: 751 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -120659,7 +121192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 + SolutionIndex: 752 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120821,7 +121354,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 + SolutionIndex: 753 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120979,7 +121512,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 + SolutionIndex: 754 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -121141,7 +121674,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 + SolutionIndex: 755 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -121303,7 +121836,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 + SolutionIndex: 756 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121461,7 +121994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 + SolutionIndex: 757 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121623,7 +122156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 + SolutionIndex: 758 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121785,7 +122318,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 + SolutionIndex: 759 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121947,7 +122480,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 + SolutionIndex: 760 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -122109,7 +122642,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 + SolutionIndex: 761 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -122271,7 +122804,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 + SolutionIndex: 762 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -122433,7 +122966,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 + SolutionIndex: 763 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -122595,7 +123128,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 + SolutionIndex: 764 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -122757,7 +123290,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 + SolutionIndex: 765 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -122919,7 +123452,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 + SolutionIndex: 766 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123077,7 +123610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 + SolutionIndex: 767 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123239,7 +123772,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 + SolutionIndex: 768 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123401,7 +123934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 + SolutionIndex: 769 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123563,7 +124096,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 + SolutionIndex: 770 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -123725,7 +124258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 + SolutionIndex: 771 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -123887,7 +124420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 + SolutionIndex: 772 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124045,7 +124578,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 + SolutionIndex: 773 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124207,7 +124740,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 + SolutionIndex: 774 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124369,7 +124902,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 + SolutionIndex: 775 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124531,7 +125064,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 + SolutionIndex: 776 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -124693,7 +125226,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 + SolutionIndex: 777 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -124855,7 +125388,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 + SolutionIndex: 778 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125017,7 +125550,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 + SolutionIndex: 779 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125179,7 +125712,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 + SolutionIndex: 780 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125341,7 +125874,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 + SolutionIndex: 781 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125503,7 +126036,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 + SolutionIndex: 782 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -125665,7 +126198,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 + SolutionIndex: 783 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125827,7 +126360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 + SolutionIndex: 784 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125989,7 +126522,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 + SolutionIndex: 785 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126151,7 +126684,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 + SolutionIndex: 786 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126313,7 +126846,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 + SolutionIndex: 787 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126475,7 +127008,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 + SolutionIndex: 788 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126637,7 +127170,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 + SolutionIndex: 789 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126799,7 +127332,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 + SolutionIndex: 790 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126957,7 +127490,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 + SolutionIndex: 791 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127119,7 +127652,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 + SolutionIndex: 792 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127281,7 +127814,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 + SolutionIndex: 793 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127443,7 +127976,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 + SolutionIndex: 794 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127605,7 +128138,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 + SolutionIndex: 795 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -127767,7 +128300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 + SolutionIndex: 796 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -127929,7 +128462,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 + SolutionIndex: 797 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128091,7 +128624,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 + SolutionIndex: 798 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128253,7 +128786,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 + SolutionIndex: 799 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128415,7 +128948,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 + SolutionIndex: 800 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128577,7 +129110,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 + SolutionIndex: 801 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128739,7 +129272,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 + SolutionIndex: 802 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128901,7 +129434,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 + SolutionIndex: 803 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129063,7 +129596,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 + SolutionIndex: 804 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129225,7 +129758,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 + SolutionIndex: 805 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129387,7 +129920,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 + SolutionIndex: 806 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129549,7 +130082,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 + SolutionIndex: 807 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129711,7 +130244,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 + SolutionIndex: 808 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129873,7 +130406,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 + SolutionIndex: 809 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -130035,7 +130568,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 + SolutionIndex: 810 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130197,7 +130730,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 + SolutionIndex: 811 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -130359,7 +130892,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 + SolutionIndex: 812 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130521,7 +131054,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 + SolutionIndex: 813 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130683,7 +131216,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 + SolutionIndex: 814 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -130845,7 +131378,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 + SolutionIndex: 815 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131007,7 +131540,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 + SolutionIndex: 816 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131169,7 +131702,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 + SolutionIndex: 817 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131331,7 +131864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 + SolutionIndex: 818 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131493,7 +132026,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 + SolutionIndex: 819 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131651,7 +132184,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 + SolutionIndex: 820 SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131813,7 +132346,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 + SolutionIndex: 821 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131975,7 +132508,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 + SolutionIndex: 822 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -132137,7 +132670,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 + SolutionIndex: 823 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132299,7 +132832,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 + SolutionIndex: 824 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132461,7 +132994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 + SolutionIndex: 825 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132623,7 +133156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 + SolutionIndex: 826 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132785,7 +133318,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 + SolutionIndex: 827 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132943,7 +133476,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 + SolutionIndex: 828 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133105,7 +133638,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 + SolutionIndex: 829 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133267,7 +133800,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 + SolutionIndex: 830 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133429,7 +133962,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 + SolutionIndex: 831 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133591,7 +134124,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 + SolutionIndex: 832 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -133753,7 +134286,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 + SolutionIndex: 833 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -133915,7 +134448,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 + SolutionIndex: 834 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134077,7 +134610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 + SolutionIndex: 835 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134239,7 +134772,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 + SolutionIndex: 836 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134401,7 +134934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 + SolutionIndex: 837 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134563,7 +135096,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 + SolutionIndex: 838 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134725,7 +135258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 + SolutionIndex: 839 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134887,7 +135420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 + SolutionIndex: 840 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -135049,7 +135582,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 + SolutionIndex: 841 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -135211,7 +135744,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 + SolutionIndex: 842 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135373,7 +135906,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 + SolutionIndex: 843 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135535,7 +136068,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 + SolutionIndex: 844 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135697,7 +136230,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 + SolutionIndex: 845 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -135859,7 +136392,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 + SolutionIndex: 846 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -136021,7 +136554,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 + SolutionIndex: 847 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -136183,7 +136716,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 + SolutionIndex: 848 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -136347,7 +136880,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 + SolutionIndex: 849 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136511,7 +137044,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 + SolutionIndex: 850 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136675,7 +137208,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 + SolutionIndex: 851 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136839,7 +137372,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 + SolutionIndex: 852 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137003,7 +137536,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 + SolutionIndex: 853 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137167,7 +137700,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 + SolutionIndex: 854 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137331,7 +137864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 + SolutionIndex: 855 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -137495,7 +138028,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 + SolutionIndex: 856 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137659,7 +138192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 + SolutionIndex: 857 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137823,7 +138356,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 + SolutionIndex: 858 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137987,7 +138520,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 + SolutionIndex: 859 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138151,7 +138684,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 + SolutionIndex: 860 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138315,7 +138848,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 + SolutionIndex: 861 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138479,7 +139012,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 + SolutionIndex: 862 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138639,7 +139172,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 + SolutionIndex: 863 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138803,7 +139336,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 + SolutionIndex: 864 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138967,7 +139500,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 + SolutionIndex: 865 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139131,7 +139664,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 + SolutionIndex: 866 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139295,7 +139828,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 + SolutionIndex: 867 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139459,7 +139992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 + SolutionIndex: 868 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -139623,7 +140156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 + SolutionIndex: 869 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -139787,7 +140320,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 + SolutionIndex: 870 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -139951,7 +140484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 + SolutionIndex: 871 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -140115,7 +140648,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 + SolutionIndex: 872 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -140280,7 +140813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 + SolutionIndex: 873 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -140447,7 +140980,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 + SolutionIndex: 874 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140612,7 +141145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 + SolutionIndex: 875 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140775,7 +141308,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 + SolutionIndex: 876 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140940,7 +141473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 + SolutionIndex: 877 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -141107,7 +141640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 + SolutionIndex: 878 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141272,7 +141805,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 + SolutionIndex: 879 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141435,7 +141968,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 + SolutionIndex: 880 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141600,7 +142133,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 + SolutionIndex: 881 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141767,7 +142300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 + SolutionIndex: 882 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -141932,7 +142465,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 + SolutionIndex: 883 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142095,7 +142628,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 + SolutionIndex: 884 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142260,7 +142793,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 + SolutionIndex: 885 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142427,7 +142960,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 + SolutionIndex: 886 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142592,7 +143125,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 + SolutionIndex: 887 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142755,7 +143288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 + SolutionIndex: 888 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142920,7 +143453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 + SolutionIndex: 889 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143085,7 +143618,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 + SolutionIndex: 890 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143252,7 +143785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 + SolutionIndex: 891 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143417,7 +143950,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 + SolutionIndex: 892 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143582,7 +144115,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 + SolutionIndex: 893 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143747,7 +144280,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 + SolutionIndex: 894 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143912,7 +144445,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 + SolutionIndex: 895 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144075,7 +144608,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 + SolutionIndex: 896 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144240,7 +144773,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 + SolutionIndex: 897 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144407,7 +144940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 + SolutionIndex: 898 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144572,7 +145105,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 + SolutionIndex: 899 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144735,7 +145268,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 + SolutionIndex: 900 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144900,7 +145433,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 + SolutionIndex: 901 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145067,7 +145600,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 + SolutionIndex: 902 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145232,7 +145765,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 + SolutionIndex: 903 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145395,7 +145928,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 + SolutionIndex: 904 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145562,7 +146095,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 + SolutionIndex: 905 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -145725,7 +146258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 + SolutionIndex: 906 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -145892,7 +146425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 + SolutionIndex: 907 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146055,7 +146588,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 + SolutionIndex: 908 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146216,7 +146749,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 + SolutionIndex: 909 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146379,7 +146912,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 + SolutionIndex: 910 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146540,7 +147073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 + SolutionIndex: 911 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146701,7 +147234,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 + SolutionIndex: 912 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146860,7 +147393,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 + SolutionIndex: 913 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147023,7 +147556,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 + SolutionIndex: 914 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147182,7 +147715,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 + SolutionIndex: 915 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147345,7 +147878,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 + SolutionIndex: 916 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147504,7 +148037,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 + SolutionIndex: 917 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147667,7 +148200,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 + SolutionIndex: 918 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147826,7 +148359,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 + SolutionIndex: 919 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147989,7 +148522,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 + SolutionIndex: 920 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148148,7 +148681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 + SolutionIndex: 921 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148311,7 +148844,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 + SolutionIndex: 922 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148472,7 +149005,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 + SolutionIndex: 923 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -148631,7 +149164,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 + SolutionIndex: 924 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148794,7 +149327,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 + SolutionIndex: 925 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -148953,7 +149486,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 + SolutionIndex: 926 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -149114,7 +149647,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 + SolutionIndex: 927 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -149275,7 +149808,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 + SolutionIndex: 928 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149442,7 +149975,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 + SolutionIndex: 929 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149611,7 +150144,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 + SolutionIndex: 930 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149778,7 +150311,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 + SolutionIndex: 931 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149943,7 +150476,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 + SolutionIndex: 932 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150110,7 +150643,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 + SolutionIndex: 933 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150280,7 +150813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 + SolutionIndex: 934 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150448,7 +150981,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 + SolutionIndex: 935 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150612,7 +151145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 + SolutionIndex: 936 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150776,7 +151309,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 + SolutionIndex: 937 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150942,7 +151475,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 + SolutionIndex: 938 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151106,7 +151639,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 + SolutionIndex: 939 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151274,7 +151807,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 + SolutionIndex: 940 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151444,7 +151977,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 + SolutionIndex: 941 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151612,7 +152145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 + SolutionIndex: 942 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151780,7 +152313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 + SolutionIndex: 943 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151948,7 +152481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 + SolutionIndex: 944 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152114,7 +152647,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 + SolutionIndex: 945 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152284,7 +152817,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 + SolutionIndex: 946 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152450,7 +152983,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 + SolutionIndex: 947 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152620,7 +153153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 + SolutionIndex: 948 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152788,7 +153321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 + SolutionIndex: 949 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152956,7 +153489,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 + SolutionIndex: 950 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153124,7 +153657,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 + SolutionIndex: 951 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153290,7 +153823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 + SolutionIndex: 952 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153460,7 +153993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 + SolutionIndex: 953 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153626,7 +154159,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 + SolutionIndex: 954 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153794,7 +154327,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 + SolutionIndex: 955 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153967,7 +154500,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 + SolutionIndex: 956 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154138,7 +154671,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 + SolutionIndex: 957 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154309,7 +154842,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 + SolutionIndex: 958 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154480,7 +155013,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 + SolutionIndex: 959 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154649,7 +155182,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 + SolutionIndex: 960 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154820,7 +155353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 + SolutionIndex: 961 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154991,7 +155524,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 + SolutionIndex: 962 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155158,7 +155691,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 + SolutionIndex: 963 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155331,7 +155864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 + SolutionIndex: 964 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155502,7 +156035,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 + SolutionIndex: 965 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155673,7 +156206,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 + SolutionIndex: 966 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155844,7 +156377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 + SolutionIndex: 967 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156015,7 +156548,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 + SolutionIndex: 968 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156186,7 +156719,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 + SolutionIndex: 969 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156355,7 +156888,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 + SolutionIndex: 970 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156526,7 +157059,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 + SolutionIndex: 971 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156697,7 +157230,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 + SolutionIndex: 972 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156868,7 +157401,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 + SolutionIndex: 973 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157039,7 +157572,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 + SolutionIndex: 974 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157212,7 +157745,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 + SolutionIndex: 975 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157383,7 +157916,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 + SolutionIndex: 976 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157554,7 +158087,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 + SolutionIndex: 977 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157725,7 +158258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 + SolutionIndex: 978 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157896,7 +158429,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 + SolutionIndex: 979 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158065,7 +158598,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 + SolutionIndex: 980 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158236,7 +158769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 + SolutionIndex: 981 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158407,7 +158940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 + SolutionIndex: 982 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158578,7 +159111,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 + SolutionIndex: 983 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158749,7 +159282,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 + SolutionIndex: 984 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158920,7 +159453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 + SolutionIndex: 985 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159091,7 +159624,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 + SolutionIndex: 986 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159264,7 +159797,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 + SolutionIndex: 987 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159435,7 +159968,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 + SolutionIndex: 988 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159606,7 +160139,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 + SolutionIndex: 989 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159777,7 +160310,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 + SolutionIndex: 990 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159948,7 +160481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 + SolutionIndex: 991 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160119,7 +160652,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 + SolutionIndex: 992 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160290,7 +160823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 + SolutionIndex: 993 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160459,7 +160992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 + SolutionIndex: 994 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160630,7 +161163,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 + SolutionIndex: 995 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160801,7 +161334,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 + SolutionIndex: 996 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160972,7 +161505,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 + SolutionIndex: 997 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161143,7 +161676,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 + SolutionIndex: 998 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161314,7 +161847,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 + SolutionIndex: 999 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161485,7 +162018,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 + SolutionIndex: 1000 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161656,7 +162189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 + SolutionIndex: 1001 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161827,7 +162360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 + SolutionIndex: 1002 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162000,7 +162533,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 + SolutionIndex: 1003 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162171,7 +162704,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 + SolutionIndex: 1004 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162342,7 +162875,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 + SolutionIndex: 1005 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162514,7 +163047,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 + SolutionIndex: 1006 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162686,7 +163219,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 + SolutionIndex: 1007 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162858,7 +163391,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 + SolutionIndex: 1008 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163030,7 +163563,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 + SolutionIndex: 1009 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163202,7 +163735,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 + SolutionIndex: 1010 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163370,7 +163903,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 + SolutionIndex: 1011 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163542,7 +164075,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 + SolutionIndex: 1012 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163714,7 +164247,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 + SolutionIndex: 1013 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163884,7 +164417,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 + SolutionIndex: 1014 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164056,7 +164589,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 + SolutionIndex: 1015 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164224,7 +164757,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 + SolutionIndex: 1016 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164396,7 +164929,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 + SolutionIndex: 1017 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164568,7 +165101,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 + SolutionIndex: 1018 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164740,7 +165273,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 + SolutionIndex: 1019 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164908,7 +165441,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 + SolutionIndex: 1020 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -165080,7 +165613,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 + SolutionIndex: 1021 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165252,7 +165785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 + SolutionIndex: 1022 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -165426,7 +165959,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 + SolutionIndex: 1023 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165598,7 +166131,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 + SolutionIndex: 1024 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165770,7 +166303,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 + SolutionIndex: 1025 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165942,7 +166475,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 + SolutionIndex: 1026 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166112,7 +166645,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 + SolutionIndex: 1027 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166286,7 +166819,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 + SolutionIndex: 1028 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166458,7 +166991,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 + SolutionIndex: 1029 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166628,7 +167161,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 + SolutionIndex: 1030 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166796,7 +167329,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 + SolutionIndex: 1031 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166968,7 +167501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 + SolutionIndex: 1032 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167140,7 +167673,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 + SolutionIndex: 1033 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167312,7 +167845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 + SolutionIndex: 1034 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167484,7 +168017,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 + SolutionIndex: 1035 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167658,7 +168191,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 + SolutionIndex: 1036 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167830,7 +168363,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 + SolutionIndex: 1037 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168002,7 +168535,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 + SolutionIndex: 1038 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168174,7 +168707,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 + SolutionIndex: 1039 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168344,7 +168877,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 + SolutionIndex: 1040 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168516,7 +169049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 + SolutionIndex: 1041 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168690,7 +169223,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 + SolutionIndex: 1042 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168862,7 +169395,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 + SolutionIndex: 1043 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169034,7 +169567,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 + SolutionIndex: 1044 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169206,7 +169739,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 + SolutionIndex: 1045 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169378,7 +169911,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 + SolutionIndex: 1046 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169548,7 +170081,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 + SolutionIndex: 1047 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169720,7 +170253,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 + SolutionIndex: 1048 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169892,7 +170425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 + SolutionIndex: 1049 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170064,7 +170597,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 + SolutionIndex: 1050 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170236,7 +170769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 + SolutionIndex: 1051 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170408,7 +170941,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 + SolutionIndex: 1052 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170580,7 +171113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 + SolutionIndex: 1053 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170754,7 +171287,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 + SolutionIndex: 1054 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170926,7 +171459,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 + SolutionIndex: 1055 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171096,7 +171629,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 + SolutionIndex: 1056 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171270,7 +171803,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 + SolutionIndex: 1057 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171442,7 +171975,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 + SolutionIndex: 1058 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171614,7 +172147,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 + SolutionIndex: 1059 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171786,7 +172319,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 + SolutionIndex: 1060 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171958,7 +172491,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 + SolutionIndex: 1061 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172130,7 +172663,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 + SolutionIndex: 1062 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172302,7 +172835,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 + SolutionIndex: 1063 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -172474,7 +173007,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 + SolutionIndex: 1064 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172644,7 +173177,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 + SolutionIndex: 1065 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -172818,7 +173351,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 + SolutionIndex: 1066 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172990,7 +173523,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 + SolutionIndex: 1067 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173162,7 +173695,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 + SolutionIndex: 1068 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173334,7 +173867,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 + SolutionIndex: 1069 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173506,7 +174039,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 + SolutionIndex: 1070 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173676,7 +174209,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 + SolutionIndex: 1071 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173850,7 +174383,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 + SolutionIndex: 1072 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174020,7 +174553,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 + SolutionIndex: 1073 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174192,7 +174725,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 + SolutionIndex: 1074 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174366,7 +174899,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 + SolutionIndex: 1075 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174536,7 +175069,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 + SolutionIndex: 1076 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174710,7 +175243,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 + SolutionIndex: 1077 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174882,7 +175415,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 + SolutionIndex: 1078 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175054,7 +175587,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 + SolutionIndex: 1079 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175226,7 +175759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 + SolutionIndex: 1080 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175398,7 +175931,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 + SolutionIndex: 1081 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175570,7 +176103,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 + SolutionIndex: 1082 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175742,7 +176275,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 + SolutionIndex: 1083 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175914,7 +176447,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 + SolutionIndex: 1084 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176086,7 +176619,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 + SolutionIndex: 1085 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176258,7 +176791,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 + SolutionIndex: 1086 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176430,7 +176963,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 + SolutionIndex: 1087 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176602,7 +177135,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 + SolutionIndex: 1088 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176774,7 +177307,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 + SolutionIndex: 1089 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176946,7 +177479,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 + SolutionIndex: 1090 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177118,7 +177651,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 + SolutionIndex: 1091 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177290,7 +177823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 + SolutionIndex: 1092 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177462,7 +177995,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 + SolutionIndex: 1093 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177634,7 +178167,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 + SolutionIndex: 1094 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177806,7 +178339,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 + SolutionIndex: 1095 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177976,7 +178509,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 + SolutionIndex: 1096 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178150,7 +178683,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 + SolutionIndex: 1097 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178322,7 +178855,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 + SolutionIndex: 1098 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178492,7 +179025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 + SolutionIndex: 1099 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178666,7 +179199,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 + SolutionIndex: 1100 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178840,7 +179373,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 + SolutionIndex: 1101 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179014,7 +179547,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 + SolutionIndex: 1102 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -179188,7 +179721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 + SolutionIndex: 1103 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179360,7 +179893,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 + SolutionIndex: 1104 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -179534,7 +180067,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 + SolutionIndex: 1105 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179710,7 +180243,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 + SolutionIndex: 1106 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179880,7 +180413,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 + SolutionIndex: 1107 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180052,7 +180585,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 + SolutionIndex: 1108 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180226,7 +180759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 + SolutionIndex: 1109 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180402,7 +180935,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 + SolutionIndex: 1110 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180576,7 +181109,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 + SolutionIndex: 1111 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180753,7 +181286,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 + SolutionIndex: 1112 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180930,7 +181463,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 + SolutionIndex: 1113 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181105,7 +181638,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 + SolutionIndex: 1114 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181282,7 +181815,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 + SolutionIndex: 1115 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181459,7 +181992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 + SolutionIndex: 1116 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181638,7 +182171,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 + SolutionIndex: 1117 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181813,7 +182346,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 + SolutionIndex: 1118 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181992,7 +182525,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 + SolutionIndex: 1119 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182167,7 +182700,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 + SolutionIndex: 1120 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -182344,7 +182877,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 + SolutionIndex: 1121 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182521,7 +183054,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 + SolutionIndex: 1122 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -182700,7 +183233,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 + SolutionIndex: 1123 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182875,7 +183408,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 + SolutionIndex: 1124 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183052,7 +183585,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 + SolutionIndex: 1125 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183229,7 +183762,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 + SolutionIndex: 1126 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183404,7 +183937,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 + SolutionIndex: 1127 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW1_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183581,7 +184114,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 + SolutionIndex: 1128 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183754,7 +184287,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 + SolutionIndex: 1129 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183929,7 +184462,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 + SolutionIndex: 1130 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_LPA4_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184106,7 +184639,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 + SolutionIndex: 1131 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184283,7 +184816,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 + SolutionIndex: 1132 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184460,7 +184993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 + SolutionIndex: 1133 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184639,7 +185172,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 + SolutionIndex: 1134 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184812,7 +185345,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 + SolutionIndex: 1135 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR0_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184985,7 +185518,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 + SolutionIndex: 1136 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185162,7 +185695,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 + SolutionIndex: 1137 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185337,7 +185870,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 + SolutionIndex: 1138 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185516,7 +186049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 + SolutionIndex: 1139 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185691,7 +186224,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 + SolutionIndex: 1140 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -185870,7 +186403,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 + SolutionIndex: 1141 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186045,7 +186578,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 + SolutionIndex: 1142 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186226,7 +186759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 + SolutionIndex: 1143 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -186401,7 +186934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 + SolutionIndex: 1144 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186580,7 +187113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 + SolutionIndex: 1145 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186755,7 +187288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 + SolutionIndex: 1146 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -186934,7 +187467,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 + SolutionIndex: 1147 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187109,7 +187642,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 + SolutionIndex: 1148 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187282,7 +187815,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 + SolutionIndex: 1149 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187461,7 +187994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 + SolutionIndex: 1150 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187636,7 +188169,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 + SolutionIndex: 1151 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187817,7 +188350,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 + SolutionIndex: 1152 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187996,7 +188529,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 + SolutionIndex: 1153 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -188173,7 +188706,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 + SolutionIndex: 1154 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -188352,7 +188885,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 + SolutionIndex: 1155 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188531,7 +189064,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 + SolutionIndex: 1156 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188712,7 +189245,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 + SolutionIndex: 1157 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188891,7 +189424,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 + SolutionIndex: 1158 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -189066,7 +189599,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 + SolutionIndex: 1159 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189245,7 +189778,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 + SolutionIndex: 1160 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189424,7 +189957,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 + SolutionIndex: 1161 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -189599,7 +190132,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 + SolutionIndex: 1162 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189778,7 +190311,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 + SolutionIndex: 1163 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189951,7 +190484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 + SolutionIndex: 1164 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190126,7 +190659,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 + SolutionIndex: 1165 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -190305,7 +190838,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 + SolutionIndex: 1166 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190484,7 +191017,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 + SolutionIndex: 1167 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -190659,7 +191192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 + SolutionIndex: 1168 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190834,7 +191367,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 + SolutionIndex: 1169 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191015,7 +191548,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 + SolutionIndex: 1170 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191190,7 +191723,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 + SolutionIndex: 1171 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191313,7 +191846,186 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191365,8 +192077,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191409,7 +192121,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -191472,8 +192184,6 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -191544,8 +192254,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191569,6 +192279,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191580,7 +192292,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191588,16 +192300,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191608,22 +192320,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -191636,11 +192348,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -191649,12 +192361,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -191721,20 +192435,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -191742,12 +192456,189 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191780,7 +192671,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -191796,15 +192687,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 3712 LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 576 LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191829,7 +192720,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -191902,8 +192793,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191922,11 +192813,186 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191947,15 +193013,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191968,10 +193034,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -192009,12 +193075,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192081,8 +193147,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192138,7 +193204,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -192154,15 +193220,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 + LdsNumElements: 3168 LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 576 LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192175,9 +193241,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192186,14 +193252,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192260,8 +193326,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM8 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192270,17 +193336,17 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -192305,15 +193371,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192326,14 +193392,14 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 2176 LdsOffsetA: 0 LdsOffsetB: 1088 LdsPadA: 2 @@ -192350,9 +193416,9 @@ LoopTail: true LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192361,13 +193427,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 @@ -192435,8 +193501,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192445,11 +193511,11 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -192471,7 +193537,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192480,43 +193546,43 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 576 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192527,11 +193593,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192540,14 +193606,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192614,8 +193680,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192624,21 +193690,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192657,17 +193723,17 @@ DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192680,22 +193746,18 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192707,10 +193769,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192719,15 +193781,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -192741,7 +193801,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192793,31 +193853,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL1_GRVW2_GSU1_PGR0_PLR1_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192829,24 +193891,24 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192857,20 +193919,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192881,11 +193947,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192894,15 +193960,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -192916,7 +193982,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192968,31 +194034,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193004,7 +194070,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193012,9 +194078,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -193025,45 +194091,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -193074,14 +194140,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -193147,31 +194211,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -199317,6 +200383,8 @@ - [556, 7802.45] - - [256, 224, 9, 9792] - [571, 7100.97] + - - [128, 128, 11, 3264] + - [577, 4828.06] - - [256, 256, 9, 4896] - [569, 6163.1] - - [320, 256, 9, 4896] @@ -199337,6 +200405,8 @@ - [565, 4940.68] - - [128, 128, 9, 9792] - [587, 4094.51] + - - [128, 128, 11, 6528] + - [587, 4780.97] - - [192, 192, 11, 6528] - [552, 6918.07] - - [160, 160, 9, 4896] @@ -199347,6 +200417,8 @@ - [558, 7526.25] - - [224, 192, 11, 6528] - [582, 7333.58] + - - [192, 192, 9, 19584] + - [583, 5859.95] - - [256, 224, 11, 13056] - [556, 6512.15] - - [224, 192, 11, 13056] @@ -199367,6 +200439,8 @@ - [580, 1382.91] - - [224, 192, 11, 3264] - [583, 7336.37] + - - [128, 128, 9, 19584] + - [550, 3631.15] - - [224, 224, 11, 6528] - [572, 5718.39] - - [160, 160, 11, 13056] @@ -199387,6 +200461,8 @@ - [551, 7770.2] - - [320, 256, 11, 13056] - [561, 8806.16] + - - [64, 64, 9, 345728] + - [589, 1386.57] - - [128, 128, 9, 4896] - [587, 4041.34] - - [256, 256, 9, 9792] @@ -199395,6 +200471,8 @@ - [564, 6936.98] - - [320, 256, 11, 3264] - [560, 8630.45] + - - [256, 256, 11, 6528] + - [552, 7354.98] - - [224, 192, 9, 4896] - [584, 6747.03] - - [256, 224, 9, 19584] @@ -199409,6 +200487,8 @@ - [573, 5133.73] - - [256, 224, 11, 6528] - [566, 6509.68] + - - [128, 128, 11, 13056] + - [557, 4411.67] - - [192, 160, 9, 4896] - [586, 5118.14] - - [256, 224, 11, 3264] @@ -199419,6 +200499,8 @@ - [567, 6185.35] - - [256, 256, 9, 19584] - [556, 6147.61] + - - [192, 128, 11, 13056] + - [567, 5112.27] - - [224, 192, 9, 9792] - [554, 6657.91] - - [160, 160, 11, 6528] @@ -199427,6258 +200509,6286 @@ - [566, 7023.59] - - [192, 128, 9, 9792] - [557, 5400.54] + - - [1024, 6400, 1, 65] + - [590, 5298.31] + - - [4096, 6400, 1, 256] + - [591, 9150.88] + - - [4096, 64, 1, 1024] + - [592, 5482.75] - - [1024, 128, 1, 128] - - [602, 896.219] + - [605, 896.219] - - [4, 704, 1, 1280] - - [639, 328.876] + - [642, 328.876] - - [4, 1856, 1, 3328] - - [649, 501.361] + - [652, 501.361] - - [1856, 448, 1, 3328] - - [694, 5677.91] + - [697, 5677.91] - - [2944, 4288, 1, 1280] - - [680, 8412.39] + - [683, 8412.39] - - [2368, 64, 1, 3328] - - [630, 4913.92] + - [633, 4913.92] - - [1760, 32, 1, 1760] - - [657, 3312.94] + - [660, 3312.94] - - [2368, 5888, 1, 256] - - [680, 6489.72] + - [683, 6489.72] - - [5888, 1856, 1, 256] - - [692, 7791.88] + - [695, 7791.88] - - [128, 64, 1, 256] - - [664, 369.217] + - [667, 369.217] - - [512, 24000, 1, 1536] - - [686, 8827.37] + - [689, 8827.37] - - [128, 6784, 1, 3328] - - [686, 6536.99] + - [689, 6536.99] - - [5888, 1408, 1, 256] - - [700, 6129.61] + - [703, 6129.61] - - [5888, 1856, 1, 3328] - - [686, 7969.17] + - [689, 7969.17] - - [5056, 704, 1, 256] - - [686, 6723.82] + - [689, 6723.82] - - [2048, 400, 1, 512] - - [692, 4531.44] + - [695, 4531.44] - - [5888, 2944, 1, 3328] - - [692, 8608.04] + - [695, 8608.04] - - [1856, 4288, 1, 256] - - [692, 6297.54] + - [695, 6297.54] - - [1024, 5056, 1, 128] - - [670, 3595.37] + - [673, 3595.37] - - [5056, 5056, 1, 3328] - - [686, 8559.16] + - [689, 8559.16] - - [1408, 5888, 1, 1280] - - [681, 6797.06] + - [684, 6797.06] - - [2368, 448, 1, 128] - - [670, 2814.9] + - [673, 2814.9] - - [2368, 6784, 1, 128] - - [674, 4781.98] + - [677, 4781.98] - - [1024, 3584, 1, 3328] - - [682, 8402.44] + - [685, 8402.44] - - [512, 48000, 1, 2048] - - [686, 8162.23] + - [689, 8162.23] - - [128, 448, 1, 1280] - - [657, 2903.49] + - [660, 2903.49] - - [256, 4288, 1, 3328] - - [687, 6345.94] + - [690, 6345.94] - - [5888, 1408, 1, 1280] - - [686, 8959.45] + - [689, 8959.45] - - [704, 1856, 1, 3328] - - [681, 6955.27] + - [684, 6955.27] - - [4, 1408, 1, 128] - - [701, 60.0747] + - [704, 60.0747] - - [1024, 2368, 1, 256] - - [688, 5927.78] + - [691, 5927.78] - - [64, 4, 1, 256] - - [706, 13.2129] + - [709, 13.2129] - - [1408, 1856, 1, 1280] - - [684, 8051.58] + - [687, 8051.58] - - [1408, 64, 1, 1280] - - [660, 3400.45] + - [663, 3400.45] - - [448, 1024, 1, 1280] - - [688, 5729.92] + - [691, 5729.92] - - [6144, 24000, 1, 2048] - - [692, 7738.3] + - [695, 7738.3] - - [4096, 32, 1, 4096] - - [630, 2381.43] + - [633, 2381.43] - - [256, 1408, 1, 3328] - - [688, 4844.78] + - [691, 4844.78] - - [5056, 5056, 1, 1280] - - [692, 9090.1] + - [695, 9090.1] - - [448, 5056, 1, 256] - - [698, 4961.18] + - [701, 4961.18] - - [704, 1856, 1, 1280] - - [684, 6456.44] + - [687, 6456.44] - - [128, 5056, 1, 128] - - [613, 2251.02] + - [616, 2251.02] - - [2368, 128, 1, 256] - - [681, 3403.27] + - [684, 3403.27] - - [1760, 6400, 1, 1760] - - [680, 8959.7] + - [683, 8959.7] - - [1856, 1408, 1, 128] - - [673, 3493.06] + - [676, 3493.06] - - [64, 5056, 1, 256] - - [682, 2582.22] + - [685, 2582.22] - - [6784, 256, 1, 3328] - - [680, 7323.54] + - [683, 7323.54] - - [6784, 4288, 1, 3328] - - [682, 8542.09] + - [685, 8542.09] - - [4288, 448, 1, 256] - - [698, 5030.5] + - [701, 5030.5] - - [64, 704, 1, 128] - - [615, 375.467] + - [618, 375.467] - - [1856, 2368, 1, 3328] - - [691, 6742.34] + - [694, 6742.34] - - [4288, 2944, 1, 1280] - - [692, 8578.17] + - [695, 8578.17] - - [704, 5056, 1, 1280] - - [688, 8014.45] + - [691, 8014.45] - - [2368, 704, 1, 3328] - - [687, 6544.31] + - [690, 6544.31] - - [256, 5888, 1, 256] - - [685, 5932.9] + - [688, 5932.9] - - [1856, 4288, 1, 3328] - - [691, 7410.72] + - [694, 7410.72] - - [256, 2944, 1, 256] - - [687, 5013.98] + - [690, 5013.98] - - [5888, 1024, 1, 256] - - [692, 8069.34] + - [695, 8069.34] - - [448, 64, 1, 1280] - - [667, 2057.18] + - [670, 2057.18] - - [3072, 64, 1, 1024] - - [647, 2145.42] + - [650, 2145.42] - - [3584, 4, 1, 1280] - - [639, 498.643] + - [642, 498.643] - - [16384, 3200, 1, 4096] - - [679, 6621.43] + - [682, 6621.43] - - [2944, 64, 1, 256] - - [687, 2554.79] + - [690, 2554.79] - - [128, 4, 1, 1280] - - [649, 87.1489] + - [652, 87.1489] - - [1408, 2944, 1, 256] - - [686, 8029.35] + - [689, 8029.35] - - [256, 1856, 1, 1280] - - [681, 6170.6] + - [684, 6170.6] - - [6784, 5056, 1, 3328] - - [690, 7134.19] + - [693, 7134.19] - - [5056, 5056, 1, 256] - - [698, 6246.8] + - [701, 6246.8] - - [1408, 6784, 1, 128] - - [675, 4329.45] + - [678, 4329.45] - - [64, 1024, 1, 1280] - - [657, 3206.65] + - [660, 3206.65] - - [2944, 4, 1, 256] - - [706, 333.48] + - [709, 333.48] - - [704, 5056, 1, 128] - - [670, 4085.42] + - [673, 4085.42] - - [4, 2368, 1, 1280] - - [707, 394.667] + - [710, 394.667] - - [2368, 2944, 1, 1280] - - [686, 8633.95] + - [689, 8633.95] - - [128, 3584, 1, 1280] - - [687, 6046.15] + - [690, 6046.15] - - [6784, 6784, 1, 1280] - - [692, 8847.41] + - [695, 8847.41] - - [1408, 4288, 1, 1280] - - [692, 8236.69] + - [695, 8236.69] - - [3584, 4288, 1, 1280] - - [687, 7399.88] + - [690, 7399.88] - - [2368, 704, 1, 1280] - - [680, 6754.4] + - [683, 6754.4] - - [5056, 4288, 1, 3328] - - [686, 8569.53] + - [689, 8569.53] - - [3584, 2368, 1, 3328] - - [691, 7942.38] + - [694, 7942.38] - - [64, 704, 1, 1280] - - [660, 2363.59] + - [663, 2363.59] - - [4288, 256, 1, 256] - - [688, 4591.8] + - [691, 4591.8] - - [2944, 128, 1, 128] - - [613, 1878.29] + - [616, 1878.29] - - [6144, 32, 1, 2560] - - [658, 3334.1] + - [661, 3334.1] - - [6784, 448, 1, 1280] - - [690, 7939.2] + - [693, 7939.2] - - [1408, 2944, 1, 128] - - [674, 4096.51] + - [677, 4096.51] - - [4288, 2944, 1, 256] - - [680, 8141.13] + - [683, 8141.13] - - [5888, 704, 1, 1280] - - [681, 7516.13] + - [684, 7516.13] - - [5056, 4, 1, 3328] - - [624, 552.409] + - [627, 552.409] - - [1856, 64, 1, 1280] - - [630, 3870.76] + - [633, 3870.76] - - [1760, 16, 1, 1760] - - [642, 2181.41] + - [645, 2181.41] - - [448, 5888, 1, 128] - - [675, 3371.0] + - [678, 3371.0] - - [5888, 64, 1, 3328] - - [655, 5319.38] + - [658, 5319.38] - - [2944, 256, 1, 3328] - - [687, 7122.3] + - [690, 7122.3] - - [1024, 64, 1, 128] - - [602, 595.782] + - [605, 595.782] - - [5056, 2368, 1, 1280] - - [681, 7778.19] + - [684, 7778.19] - - [448, 3584, 1, 1280] - - [686, 6500.52] + - [689, 6500.52] - - [6784, 5888, 1, 256] - - [686, 8918.58] + - [689, 8918.58] - - [704, 1024, 1, 128] - - [670, 2627.41] + - [673, 2627.41] - - [704, 128, 1, 1280] - - [657, 3408.49] + - [660, 3408.49] - - [4, 3584, 1, 128] - - [701, 140.721] + - [704, 140.721] - - [1408, 448, 1, 1280] - - [681, 5881.44] + - [684, 5881.44] - - [1024, 1408, 1, 256] - - [685, 5647.17] + - [688, 5647.17] - - [2368, 2368, 1, 3328] - - [679, 7688.73] + - [682, 7688.73] - - [1856, 6784, 1, 128] - - [670, 4705.85] + - [673, 4705.85] - - [5056, 704, 1, 3328] - - [690, 8198.88] + - [693, 8198.88] - - [1408, 1856, 1, 256] - - [692, 6339.95] + - [695, 6339.95] - - [1408, 704, 1, 3328] - - [684, 7599.55] + - [687, 7599.55] - - [2368, 5056, 1, 256] - - [692, 8242.75] + - [695, 8242.75] - - [1408, 256, 1, 1280] - - [687, 4879.16] + - [690, 4879.16] - - [3072, 128, 1, 1024] - - [656, 2525.42] + - [659, 2525.42] - - [3584, 2368, 1, 1280] - - [688, 8132.62] + - [691, 8132.62] - - [4288, 64, 1, 3328] - - [643, 5156.43] + - [646, 5156.43] - - [2368, 4, 1, 1280] - - [705, 482.65] + - [708, 482.65] - - [704, 5888, 1, 256] - - [695, 5398.65] + - [698, 5398.65] - - [6784, 2944, 1, 128] - - [671, 4748.89] + - [674, 4748.89] - - [2560, 1600, 1, 2560] - - [682, 7354.9] + - [685, 7354.9] - - [4288, 6784, 1, 3328] - - [679, 7409.31] + - [682, 7409.31] - - [2944, 256, 1, 256] - - [687, 5077.32] + - [690, 5077.32] - - [2944, 6784, 1, 3328] - - [692, 8067.95] + - [695, 8067.95] - - [704, 1408, 1, 3328] - - [687, 7239.33] + - [690, 7239.33] - - [6144, 5984, 1, 2048] - - [686, 7175.97] + - [689, 7175.97] - - [3584, 704, 1, 3328] - - [692, 6642.76] + - [695, 6642.76] - - [2944, 256, 1, 128] - - [671, 2644.44] + - [674, 2644.44] - - [6784, 4, 1, 1280] - - [703, 402.387] + - [706, 402.387] - - [1024, 64, 1, 1280] - - [657, 2601.93] + - [660, 2601.93] - - [2048, 1600, 1, 512] - - [684, 5592.4] + - [687, 5592.4] - - [448, 4288, 1, 256] - - [682, 6128.89] + - [685, 6128.89] - - [64, 3584, 1, 3328] - - [623, 5534.83] + - [626, 5534.83] - - [1856, 4288, 1, 128] - - [673, 4400.01] + - [676, 4400.01] - - [704, 2368, 1, 1280] - - [698, 5734.92] + - [701, 5734.92] - - [1856, 2368, 1, 1280] - - [695, 6482.3] + - [698, 6482.3] - - [2368, 128, 1, 3328] - - [668, 4717.22] + - [671, 4717.22] - - [2944, 128, 1, 256] - - [695, 3276.8] + - [698, 3276.8] - - [448, 1408, 1, 256] - - [687, 4852.18] + - [690, 4852.18] - - [1856, 4288, 1, 1280] - - [682, 8132.86] + - [685, 8132.86] - - [64, 5056, 1, 3328] - - [658, 5096.96] + - [661, 5096.96] - - [4, 704, 1, 256] - - [705, 128.731] + - [708, 128.731] - - [1024, 448, 1, 128] - - [670, 1816.84] + - [673, 1816.84] - - [704, 4, 1, 1280] - - [706, 328.876] + - [709, 328.876] - - [704, 256, 1, 128] - - [674, 876.469] + - [677, 876.469] - - [704, 2944, 1, 128] - - [674, 3734.37] + - [677, 3734.37] - - [1408, 1024, 1, 1280] - - [682, 7224.75] + - [685, 7224.75] - - [704, 6784, 1, 256] - - [686, 7354.67] + - [689, 7354.67] - - [6784, 704, 1, 256] - - [682, 6012.18] + - [685, 6012.18] - - [5056, 1408, 1, 128] - - [675, 4311.18] + - [678, 4311.18] - - [2048, 7000, 1, 2048] - - [686, 7231.97] + - [689, 7231.97] - - [256, 3584, 1, 3328] - - [690, 7005.9] + - [693, 7005.9] - - [4, 5888, 1, 3328] - - [708, 534.512] + - [711, 534.512] - - [128, 1408, 1, 128] - - [600, 1176.97] + - [603, 1176.97] - - [3584, 4288, 1, 3328] - - [692, 7134.9] + - [695, 7134.9] - - [5888, 1856, 1, 1280] - - [680, 8394.93] + - [683, 8394.93] - - [256, 1408, 1, 256] - - [681, 3977.36] + - [684, 3977.36] - - [5056, 64, 1, 1280] - - [681, 4257.68] + - [684, 4257.68] - - [1024, 704, 1, 256] - - [681, 5036.83] + - [684, 5036.83] - - [448, 128, 1, 128] - - [602, 533.433] + - [605, 533.433] - - [2368, 3584, 1, 1280] - - [686, 8272.33] + - [689, 8272.33] - - [2368, 6784, 1, 1280] - - [679, 8288.14] + - [682, 8288.14] - - [1856, 4, 1, 1280] - - [619, 464.0] + - [622, 464.0] - - [448, 448, 1, 256] - - [681, 3058.35] + - [684, 3058.35] - - [2944, 3584, 1, 3328] - - [686, 8557.53] + - [689, 8557.53] - - [7680, 32, 1, 2560] - - [658, 3728.93] + - [661, 3728.93] - - [128, 4288, 1, 128] - - [601, 2116.1] + - [604, 2116.1] - - [256, 256, 1, 3328] - - [657, 4050.96] + - [660, 4050.96] - - [128, 1024, 1, 3328] - - [630, 5139.11] + - [633, 5139.11] - - [4, 1408, 1, 3328] - - [649, 502.771] + - [652, 502.771] - - [6784, 2944, 1, 256] - - [680, 8445.96] + - [683, 8445.96] - - [64, 1856, 1, 1280] - - [622, 3870.76] + - [625, 3870.76] - - [6784, 64, 1, 128] - - [670, 1877.52] + - [673, 1877.52] - - [4288, 2368, 1, 3328] - - [690, 8419.3] + - [693, 8419.3] - - [1856, 2368, 1, 256] - - [684, 6887.38] + - [687, 6887.38] - - [3584, 256, 1, 128] - - [674, 2496.61] + - [677, 2496.61] - - [3584, 6784, 1, 3328] - - [686, 7626.08] + - [689, 7626.08] - - [256, 1024, 1, 256] - - [687, 3095.43] + - [690, 3095.43] - - [4, 6784, 1, 3328] - - [649, 589.174] + - [652, 589.174] - - [1024, 5888, 1, 3328] - - [686, 7794.25] + - [689, 7794.25] - - [1024, 128, 1, 1280] - - [659, 3130.08] + - [662, 3130.08] - - [3072, 32, 1, 1024] - - [646, 1675.49] + - [649, 1675.49] - - [6144, 24000, 1, 2560] - - [686, 7256.04] + - [689, 7256.04] - - [5056, 4288, 1, 1280] - - [684, 8348.93] + - [687, 8348.93] - - [5888, 64, 1, 256] - - [633, 2593.25] + - [636, 2593.25] - - [6784, 1856, 1, 3328] - - [680, 8087.28] + - [683, 8087.28] - - [1408, 5056, 1, 1280] - - [682, 7802.53] + - [685, 7802.53] - - [1856, 256, 1, 1280] - - [687, 6150.63] + - [690, 6150.63] - - [64, 5888, 1, 3328] - - [654, 5301.39] + - [657, 5301.39] - - [2368, 2368, 1, 1280] - - [684, 8233.33] + - [687, 8233.33] - - [2944, 5888, 1, 128] - - [677, 3745.41] + - [680, 3745.41] - - [704, 5888, 1, 1280] - - [682, 8244.94] + - [685, 8244.94] - - [2368, 3584, 1, 128] - - [674, 4523.33] + - [677, 4523.33] - - [1856, 5056, 1, 128] - - [671, 4497.98] + - [674, 4497.98] - - [704, 1024, 1, 1280] - - [695, 5479.49] + - [698, 5479.49] - - [448, 256, 1, 3328] - - [638, 5048.7] + - [641, 5048.7] - - [448, 1856, 1, 128] - - [671, 2936.82] + - [674, 2936.82] - - [8192, 3200, 1, 2048] - - [680, 6713.02] + - [683, 6713.02] - - [128, 1024, 1, 128] - - [616, 998.644] + - [619, 998.644] - - [2944, 4, 1, 128] - - [701, 98.6471] + - [704, 98.6471] - - [1024, 704, 1, 1280] - - [687, 5896.9] + - [690, 5896.9] - - [128, 5888, 1, 256] - - [687, 5013.98] + - [690, 5013.98] - - [1024, 5056, 1, 1280] - - [686, 8857.71] + - [689, 8857.71] - - [4288, 1024, 1, 256] - - [692, 6195.29] + - [695, 6195.29] - - [2944, 2368, 1, 128] - - [670, 4442.13] + - [673, 4442.13] - - [704, 704, 1, 3328] - - [687, 6764.3] + - [690, 6764.3] - - [704, 1408, 1, 1280] - - [688, 7383.48] + - [691, 7383.48] - - [5888, 448, 1, 1280] - - [686, 7299.39] + - [689, 7299.39] - - [3584, 256, 1, 3328] - - [684, 7061.62] + - [687, 7061.62] - - [704, 5888, 1, 3328] - - [688, 8142.32] + - [691, 8142.32] - - [704, 1856, 1, 128] - - [674, 3139.04] + - [677, 3139.04] - - [448, 448, 1, 3328] - - [652, 5063.24] + - [655, 5063.24] - - [4, 4288, 1, 128] - - [702, 64.8775] + - [705, 64.8775] - - [128, 704, 1, 1280] - - [622, 3400.45] + - [625, 3400.45] - - [3584, 2944, 1, 256] - - [692, 7982.04] + - [695, 7982.04] - - [3584, 4, 1, 128] - - [701, 105.218] + - [704, 105.218] - - [1856, 128, 1, 3328] - - [653, 5442.09] + - [656, 5442.09] - - [4, 64, 1, 1280] - - [707, 42.2268] + - [710, 42.2268] - - [2944, 448, 1, 128] - - [670, 2926.85] + - [673, 2926.85] - - [128, 2944, 1, 1280] - - [681, 5109.59] + - [684, 5109.59] - - [64, 64, 1, 3328] - - [649, 1252.89] + - [652, 1252.89] - - [448, 2944, 1, 1280] - - [690, 6684.37] + - [693, 6684.37] - - [512, 24000, 1, 2048] - - [686, 7938.93] + - [689, 7938.93] - - [128, 256, 1, 3328] - - [667, 3276.8] + - [670, 3276.8] - - [1408, 5056, 1, 3328] - - [692, 8959.11] + - [695, 8959.11] - - [1856, 1856, 1, 3328] - - [682, 8006.07] + - [685, 8006.07] - - [3584, 128, 1, 256] - - [687, 4292.42] + - [690, 4292.42] - - [2560, 800, 1, 2560] - - [682, 6262.38] + - [685, 6262.38] - - [448, 1408, 1, 3328] - - [698, 4997.25] + - [701, 4997.25] - - [2368, 2368, 1, 256] - - [700, 4978.84] + - [703, 4978.84] - - [4288, 4288, 1, 1280] - - [679, 8617.68] + - [682, 8617.68] - - [64, 448, 1, 1280] - - [625, 2057.18] + - [628, 2057.18] - - [5888, 1024, 1, 1280] - - [697, 6848.07] + - [700, 6848.07] - - [1408, 4288, 1, 256] - - [680, 7076.91] + - [683, 7076.91] - - [448, 4, 1, 256] - - [705, 84.3294] + - [708, 84.3294] - - [5888, 448, 1, 128] - - [674, 3493.81] + - [677, 3493.81] - - [512, 48000, 1, 2560] - - [692, 8960.03] + - [695, 8960.03] - - [35, 8457, 1, 1760] - - [594, 3934.68] + - [597, 3934.68] - - [704, 6784, 1, 3328] - - [679, 8180.78] + - [682, 8180.78] - - [2560, 6400, 1, 2560] - - [680, 7822.14] + - [683, 7822.14] - - [5056, 1024, 1, 1280] - - [682, 8357.28] + - [685, 8357.28] - - [448, 5888, 1, 3328] - - [686, 7505.18] + - [689, 7505.18] - - [128, 4, 1, 128] - - [701, 0.562251] + - [704, 0.562251] - - [1024, 2944, 1, 1280] - - [686, 8406.14] + - [689, 8406.14] - - [5056, 5888, 1, 1280] - - [686, 8819.66] + - [689, 8819.66] - - [4288, 5888, 1, 128] - - [671, 3522.22] + - [674, 3522.22] - - [256, 3584, 1, 256] - - [682, 5883.79] + - [685, 5883.79] - - [1408, 3584, 1, 128] - - [670, 4283.31] + - [673, 4283.31] - - [256, 2944, 1, 3328] - - [690, 5670.53] + - [693, 5670.53] - - [448, 3584, 1, 128] - - [674, 3171.62] + - [677, 3171.62] - - [5888, 2944, 1, 1280] - - [692, 8198.76] + - [695, 8198.76] - - [4, 6784, 1, 1280] - - [639, 553.796] + - [642, 553.796] - - [2368, 5888, 1, 128] - - [670, 4787.22] + - [673, 4787.22] - - [8448, 16, 1, 2816] - - [629, 2452.53] + - [632, 2452.53] - - [64, 2944, 1, 128] - - [602, 1376.56] + - [605, 1376.56] - - [2368, 4, 1, 256] - - [624, 278.077] + - [627, 278.077] - - [3584, 5888, 1, 256] - - [700, 6233.56] + - [703, 6233.56] - - [2368, 1024, 1, 128] - - [671, 3781.41] + - [674, 3781.41] - - [2368, 704, 1, 128] - - [671, 3198.22] + - [674, 3198.22] - - [3584, 2944, 1, 1280] - - [682, 8045.58] + - [685, 8045.58] - - [3584, 2368, 1, 128] - - [671, 4188.47] + - [674, 4188.47] - - [5056, 704, 1, 128] - - [674, 4019.11] + - [677, 4019.11] - - [448, 2368, 1, 128] - - [676, 2522.11] + - [679, 2522.11] - - [5056, 1408, 1, 3328] - - [684, 8349.83] + - [687, 8349.83] - - [1408, 704, 1, 256] - - [690, 4741.32] + - [693, 4741.32] - - [6784, 1024, 1, 3328] - - [692, 8769.4] + - [695, 8769.4] - - [6784, 2944, 1, 3328] - - [689, 7319.64] + - [692, 7319.64] - - [2944, 5056, 1, 3328] - - [679, 8889.66] + - [682, 8889.66] - - [1856, 1856, 1, 256] - - [682, 6309.74] + - [685, 6309.74] - - [1024, 5888, 1, 128] - - [673, 3759.5] + - [676, 3759.5] - - [6784, 2368, 1, 1280] - - [682, 8298.3] + - [685, 8298.3] - - [256, 4, 1, 128] - - [701, 7.00171] + - [704, 7.00171] - - [4288, 5888, 1, 1280] - - [686, 8365.18] + - [689, 8365.18] - - [4288, 4288, 1, 256] - - [686, 6513.68] + - [689, 6513.68] - - [8448, 32, 1, 2816] - - [657, 4257.64] + - [660, 4257.64] - - [448, 2944, 1, 3328] - - [690, 6875.52] + - [693, 6875.52] - - [5888, 4, 1, 128] - - [701, 163.84] + - [704, 163.84] - - [4288, 1856, 1, 1280] - - [686, 8402.81] + - [689, 8402.81] - - [1856, 2944, 1, 3328] - - [686, 6612.11] + - [689, 6612.11] - - [256, 6784, 1, 3328] - - [687, 7358.6] + - [690, 7358.6] - - [64, 5888, 1, 256] - - [681, 3358.95] + - [684, 3358.95] - - [256, 5056, 1, 128] - - [674, 2489.11] + - [677, 2489.11] - - [5056, 1024, 1, 256] - - [692, 8077.77] + - [695, 8077.77] - - [704, 64, 1, 3328] - - [636, 3288.3] + - [639, 3288.3] - - [5056, 1856, 1, 3328] - - [690, 8171.03] + - [693, 8171.03] - - [4, 2944, 1, 3328] - - [649, 546.743] + - [652, 546.743] - - [4, 5056, 1, 256] - - [624, 378.461] + - [627, 378.461] - - [1856, 1408, 1, 256] - - [692, 6320.78] + - [695, 6320.78] - - [8448, 12000, 1, 2816] - - [690, 7365.77] + - [693, 7365.77] - - [6784, 128, 1, 3328] - - [687, 6366.47] + - [690, 6366.47] - - [4288, 1408, 1, 128] - - [670, 4451.6] + - [673, 4451.6] - - [1856, 5888, 1, 3328] - - [688, 8619.66] + - [691, 8619.66] - - [4288, 5056, 1, 256] - - [692, 7288.95] + - [695, 7288.95] - - [1408, 128, 1, 1280] - - [630, 4291.05] + - [633, 4291.05] - - [4096, 800, 1, 1024] - - [681, 5867.79] + - [684, 5867.79] - - [5056, 256, 1, 3328] - - [687, 7527.51] + - [690, 7527.51] - - [704, 704, 1, 256] - - [687, 4417.75] + - [690, 4417.75] - - [1024, 5888, 1, 1280] - - [692, 8674.47] + - [695, 8674.47] - - [6784, 2368, 1, 128] - - [670, 4723.98] + - [673, 4723.98] - - [4, 5056, 1, 1280] - - [639, 540.207] + - [642, 540.207] - - [256, 64, 1, 1280] - - [641, 1515.28] + - [644, 1515.28] - - [128, 1856, 1, 1280] - - [681, 4574.11] + - [684, 4574.11] - - [1856, 1024, 1, 1280] - - [686, 7741.51] + - [689, 7741.51] - - [6784, 4288, 1, 1280] - - [692, 8521.19] + - [695, 8521.19] - - [2560, 64, 1, 2560] - - [623, 3504.6] + - [626, 3504.6] - - [1856, 1856, 1, 1280] - - [682, 7779.21] + - [685, 7779.21] - - [4096, 400, 1, 1024] - - [692, 4157.71] + - [695, 4157.71] - - [3072, 24000, 1, 1024] - - [692, 8663.35] + - [695, 8663.35] - - [128, 4288, 1, 3328] - - [638, 5674.13] + - [641, 5674.13] - - [4, 2368, 1, 3328] - - [649, 525.38] + - [652, 525.38] - - [5888, 1856, 1, 128] - - [674, 4099.64] + - [677, 4099.64] - - [448, 704, 1, 1280] - - [687, 4309.37] + - [690, 4309.37] - - [128, 5056, 1, 1280] - - [630, 5068.36] + - [633, 5068.36] - - [1024, 448, 1, 3328] - - [690, 6077.72] + - [693, 6077.72] - - [1856, 704, 1, 1280] - - [698, 6257.39] + - [701, 6257.39] - - [5056, 3584, 1, 128] - - [671, 4598.42] + - [674, 4598.42] - - [5888, 5888, 1, 3328] - - [692, 8058.15] + - [695, 8058.15] - - [6784, 1024, 1, 256] - - [692, 5120.89] + - [695, 5120.89] - - [2944, 2368, 1, 256] - - [683, 6522.93] + - [686, 6522.93] - - [256, 448, 1, 256] - - [633, 1816.84] + - [636, 1816.84] - - [5056, 5888, 1, 3328] - - [685, 6722.31] + - [688, 6722.31] - - [1856, 1024, 1, 256] - - [692, 6632.21] + - [695, 6632.21] - - [512, 48000, 1, 1536] - - [686, 8555.91] + - [689, 8555.91] - - [3584, 448, 1, 1280] - - [681, 6566.99] + - [684, 6566.99] - - [8448, 5984, 1, 2816] - - [686, 8990.56] + - [689, 8990.56] - - [448, 5888, 1, 256] - - [686, 6220.37] + - [689, 6220.37] - - [704, 64, 1, 128] - - [599, 450.56] + - [602, 450.56] - - [1408, 6784, 1, 3328] - - [679, 8478.58] + - [682, 8478.58] - - [448, 1024, 1, 128] - - [678, 1844.23] + - [681, 1844.23] - - [4288, 704, 1, 128] - - [674, 3895.16] + - [677, 3895.16] - - [128, 1856, 1, 128] - - [605, 1456.36] + - [608, 1456.36] - - [448, 2368, 1, 3328] - - [684, 5537.94] + - [687, 5537.94] - - [5056, 64, 1, 128] - - [670, 1648.84] + - [673, 1648.84] - - [5056, 2944, 1, 256] - - [686, 8230.77] + - [689, 8230.77] - - [6784, 5888, 1, 128] - - [670, 4873.09] + - [673, 4873.09] - - [1024, 700, 1, 512] - - [684, 4445.27] + - [687, 4445.27] - - [704, 1024, 1, 256] - - [682, 4707.89] + - [685, 4707.89] - - [1024, 4, 1, 256] - - [624, 174.763] + - [627, 174.763] - - [2944, 704, 1, 128] - - [674, 3483.32] + - [677, 3483.32] - - [128, 6784, 1, 1280] - - [682, 6522.83] + - [685, 6522.83] - - [1408, 3584, 1, 3328] - - [686, 8673.49] + - [689, 8673.49] - - [2368, 6784, 1, 256] - - [682, 7941.66] + - [685, 7941.66] - - [5056, 1408, 1, 1280] - - [686, 8800.91] + - [689, 8800.91] - - [256, 256, 1, 128] - - [611, 551.882] + - [614, 551.882] - - [5056, 4288, 1, 128] - - [678, 3793.54] + - [681, 3793.54] - - [1408, 1856, 1, 128] - - [670, 3067.64] + - [673, 3067.64] - - [1408, 5888, 1, 3328] - - [686, 9148.87] + - [689, 9148.87] - - [1856, 256, 1, 256] - - [682, 4319.42] + - [685, 4319.42] - - [6784, 6784, 1, 256] - - [682, 7668.43] + - [685, 7668.43] - - [64, 256, 1, 128] - - [616, 131.072] + - [619, 131.072] - - [4288, 2368, 1, 128] - - [671, 4582.89] + - [674, 4582.89] - - [256, 4288, 1, 1280] - - [681, 6058.51] + - [684, 6058.51] - - [2368, 2944, 1, 256] - - [686, 8015.97] + - [689, 8015.97] - - [4, 1856, 1, 256] - - [703, 252.732] + - [706, 252.732] - - [3584, 1856, 1, 1280] - - [682, 7760.14] + - [685, 7760.14] - - [6784, 6784, 1, 128] - - [671, 4970.04] + - [674, 4970.04] - - [256, 1856, 1, 128] - - [677, 1580.49] + - [680, 1580.49] - - [704, 64, 1, 1280] - - [666, 2556.37] + - [669, 2556.37] - - [5888, 5056, 1, 256] - - [686, 8216.57] + - [689, 8216.57] - - [8448, 48000, 1, 2816] - - [692, 4082.79] + - [695, 4082.79] - - [3584, 448, 1, 256] - - [686, 5518.82] + - [689, 5518.82] - - [448, 4288, 1, 128] - - [674, 3415.15] + - [677, 3415.15] - - [7680, 64, 1, 2560] - - [635, 5162.0] + - [638, 5162.0] - - [256, 6784, 1, 256] - - [686, 6272.52] + - [689, 6272.52] - - [1408, 4288, 1, 128] - - [674, 4343.53] + - [677, 4343.53] - - [2944, 704, 1, 3328] - - [681, 7679.61] + - [684, 7679.61] - - [128, 448, 1, 256] - - [621, 1422.49] + - [624, 1422.49] - - [5056, 256, 1, 1280] - - [688, 5052.29] + - [691, 5052.29] - - [2560, 32, 1, 2560] - - [644, 3105.97] + - [647, 3105.97] - - [3584, 3584, 1, 256] - - [692, 8260.47] + - [695, 8260.47] - - [448, 1408, 1, 128] - - [670, 2397.28] + - [673, 2397.28] - - [128, 256, 1, 1280] - - [625, 2340.57] + - [628, 2340.57] - - [3584, 5056, 1, 256] - - [692, 7347.46] + - [695, 7347.46] - - [6784, 128, 1, 256] - - [682, 5591.0] + - [685, 5591.0] - - [4288, 4, 1, 256] - - [624, 354.106] + - [627, 354.106] - - [704, 448, 1, 256] - - [687, 3492.23] + - [690, 3492.23] - - [2944, 2368, 1, 1280] - - [694, 6661.61] + - [697, 6661.61] - - [448, 64, 1, 3328] - - [666, 3058.35] + - [669, 3058.35] - - [1408, 3584, 1, 256] - - [692, 7966.49] + - [695, 7966.49] - - [3584, 4, 1, 3328] - - [705, 605.459] + - [708, 605.459] - - [6784, 3584, 1, 256] - - [682, 7525.31] + - [685, 7525.31] - - [256, 128, 1, 128] - - [614, 275.941] + - [617, 275.941] - - [704, 1408, 1, 128] - - [671, 3109.75] + - [674, 3109.75] - - [4, 2368, 1, 256] - - [705, 283.275] + - [708, 283.275] - - [4288, 128, 1, 1280] - - [687, 5132.55] + - [690, 5132.55] - - [128, 1408, 1, 256] - - [681, 2733.25] + - [684, 2733.25] - - [4, 2944, 1, 256] - - [703, 314.027] + - [706, 314.027] - - [64, 128, 1, 3328] - - [651, 1514.61] + - [654, 1514.61] - - [5056, 2368, 1, 128] - - [675, 3449.07] + - [678, 3449.07] - - [2944, 2944, 1, 3328] - - [679, 8168.93] + - [682, 8168.93] - - [5056, 6784, 1, 256] - - [699, 5792.67] + - [702, 5792.67] - - [1856, 3584, 1, 128] - - [676, 4213.4] + - [679, 4213.4] - - [128, 2944, 1, 128] - - [600, 1970.36] + - [603, 1970.36] - - [35, 8457, 1, 2560] - - [595, 3525.05] + - [598, 3525.05] - - [1024, 704, 1, 3328] - - [681, 6784.89] + - [684, 6784.89] - - [6784, 448, 1, 256] - - [690, 6544.78] + - [693, 6544.78] - - [3584, 6784, 1, 128] - - [670, 4623.5] + - [673, 4623.5] - - [128, 4288, 1, 256] - - [684, 3606.5] + - [687, 3606.5] - - [704, 448, 1, 3328] - - [681, 4477.91] + - [684, 4477.91] - - [128, 128, 1, 3328] - - [666, 2177.55] + - [669, 2177.55] - - [5056, 1856, 1, 256] - - [700, 5608.62] + - [703, 5608.62] - - [4608, 5984, 1, 1536] - - [689, 7859.75] + - [692, 7859.75] - - [256, 128, 1, 256] - - [625, 998.644] + - [628, 998.644] - - [1760, 3200, 1, 1760] - - [682, 8179.54] + - [685, 8179.54] - - [1024, 1856, 1, 256] - - [692, 6143.17] + - [695, 6143.17] - - [4096, 1600, 1, 1024] - - [700, 5851.42] + - [703, 5851.42] - - [4288, 64, 1, 128] - - [605, 1372.16] + - [608, 1372.16] - - [256, 448, 1, 3328] - - [644, 4795.0] + - [647, 4795.0] - - [1408, 6784, 1, 1280] - - [686, 8426.4] + - [689, 8426.4] - - [3584, 3584, 1, 1280] - - [686, 7556.46] + - [689, 7556.46] - - [7680, 24000, 1, 2560] - - [679, 5019.09] + - [682, 5019.09] - - [64, 2368, 1, 1280] - - [630, 4061.7] + - [633, 4061.7] - - [448, 2368, 1, 1280] - - [681, 5928.67] + - [684, 5928.67] - - [4608, 48000, 1, 1536] - - [686, 6937.3] + - [689, 6937.3] - - [5888, 5888, 1, 128] - - [671, 3743.9] + - [674, 3743.9] - - [64, 6784, 1, 3328] - - [681, 5988.62] + - [684, 5988.62] - - [2944, 256, 1, 1280] - - [687, 6717.87] + - [690, 6717.87] - - [2048, 16, 1, 2048] - - [639, 1210.48] + - [642, 1210.48] - - [256, 2368, 1, 128] - - [674, 1935.97] + - [677, 1935.97] - - [5056, 2368, 1, 3328] - - [692, 8875.53] + - [695, 8875.53] - - [2944, 4288, 1, 256] - - [686, 8063.14] + - [689, 8063.14] - - [1408, 3584, 1, 1280] - - [682, 8196.97] + - [685, 8196.97] - - [2368, 64, 1, 256] - - [681, 2365.69] + - [684, 2365.69] - - [64, 448, 1, 3328] - - [667, 3027.3] + - [670, 3027.3] - - [704, 128, 1, 3328] - - [638, 4452.09] + - [641, 4452.09] - - [8192, 1600, 1, 2048] - - [686, 7229.83] + - [689, 7229.83] - - [1856, 704, 1, 256] - - [688, 5545.35] + - [691, 5545.35] - - [4, 4288, 1, 1280] - - [639, 523.725] + - [642, 523.725] - - [1408, 448, 1, 3328] - - [693, 4789.3] + - [696, 4789.3] - - [1024, 4, 1, 3328] - - [619, 504.123] + - [622, 504.123] - - [512, 24000, 1, 2560] - - [692, 8903.52] + - [695, 8903.52] - - [2368, 6784, 1, 3328] - - [692, 8311.04] + - [695, 8311.04] - - [1856, 1408, 1, 1280] - - [682, 8160.01] + - [685, 8160.01] - - [1856, 448, 1, 1280] - - [684, 6242.97] + - [687, 6242.97] - - [6784, 704, 1, 128] - - [670, 4068.95] + - [673, 4068.95] - - [4, 4, 1, 256] - - [639, 0.742029] + - [642, 0.742029] - - [128, 5888, 1, 128] - - [670, 2327.92] + - [673, 2327.92] - - [1408, 5888, 1, 256] - - [681, 6986.81] + - [684, 6986.81] - - [704, 2944, 1, 1280] - - [682, 7904.93] + - [685, 7904.93] - - [4288, 64, 1, 1280] - - [657, 3828.17] + - [660, 3828.17] - - [256, 64, 1, 256] - - [632, 655.36] + - [635, 655.36] - - [704, 1856, 1, 256] - - [690, 5444.27] + - [693, 5444.27] - - [704, 6784, 1, 128] - - [670, 4319.67] + - [673, 4319.67] - - [3584, 704, 1, 1280] - - [690, 7726.33] + - [693, 7726.33] - - [256, 128, 1, 1280] - - [625, 2184.53] + - [628, 2184.53] - - [5888, 2368, 1, 256] - - [692, 8192.59] + - [695, 8192.59] - - [256, 2368, 1, 1280] - - [687, 5675.44] + - [690, 5675.44] - - [2944, 6784, 1, 128] - - [675, 4248.25] + - [678, 4248.25] - - [3584, 448, 1, 3328] - - [686, 6560.67] + - [689, 6560.67] - - [1408, 4, 1, 256] - - [704, 176.69] + - [707, 176.69] - - [704, 2368, 1, 3328] - - [687, 7085.21] + - [690, 7085.21] - - [2944, 448, 1, 256] - - [683, 3411.9] + - [686, 3411.9] - - [1856, 448, 1, 128] - - [671, 2748.72] + - [674, 2748.72] - - [4288, 4, 1, 3328] - - [639, 553.548] + - [642, 553.548] - - [2368, 128, 1, 1280] - - [660, 4173.55] + - [663, 4173.55] - - [256, 5888, 1, 128] - - [675, 2860.88] + - [678, 2860.88] - - [64, 6784, 1, 256] - - [688, 3637.08] + - [691, 3637.08] - - [64, 5056, 1, 1280] - - [687, 4289.43] + - [690, 4289.43] - - [4, 6784, 1, 128] - - [701, 160.806] + - [704, 160.806] - - [2048, 3200, 1, 512] - - [688, 6926.99] + - [691, 6926.99] - - [2944, 2944, 1, 1280] - - [680, 6267.75] + - [683, 6267.75] - - [5056, 448, 1, 3328] - - [681, 7400.26] + - [684, 7400.26] - - [4, 3584, 1, 1280] - - [639, 499.73] + - [642, 499.73] - - [1408, 128, 1, 128] - - [616, 1037.26] + - [619, 1037.26] - - [6784, 704, 1, 3328] - - [687, 7633.85] + - [690, 7633.85] - - [128, 64, 1, 1280] - - [639, 1170.29] + - [642, 1170.29] - - [2368, 256, 1, 1280] - - [687, 5609.79] + - [690, 5609.79] - - [4, 448, 1, 3328] - - [707, 358.4] + - [710, 358.4] - - [5888, 4288, 1, 128] - - [675, 4521.64] + - [678, 4521.64] - - [4, 5888, 1, 256] - - [639, 353.833] + - [642, 353.833] - - [1408, 2944, 1, 3328] - - [680, 8951.31] + - [683, 8951.31] - - [3584, 704, 1, 128] - - [670, 3395.31] + - [673, 3395.31] - - [4608, 12000, 1, 1536] - - [679, 6609.89] + - [682, 6609.89] - - [64, 1024, 1, 256] - - [625, 1588.75] + - [628, 1588.75] - - [5056, 5056, 1, 128] - - [670, 4080.71] + - [673, 4080.71] - - [2368, 448, 1, 1280] - - [681, 5422.94] + - [684, 5422.94] - - [128, 3584, 1, 256] - - [687, 4705.15] + - [690, 4705.15] - - [704, 448, 1, 1280] - - [684, 3960.97] + - [687, 3960.97] - - [8192, 800, 1, 2048] - - [682, 6306.26] + - [685, 6306.26] - - [448, 5056, 1, 128] - - [674, 3709.46] + - [677, 3709.46] - - [256, 4, 1, 1280] - - [706, 163.84] + - [709, 163.84] - - [5056, 3584, 1, 256] - - [679, 7008.24] + - [682, 7008.24] - - [2368, 4, 1, 3328] - - [639, 496.266] + - [642, 496.266] - - [1408, 5056, 1, 128] - - [674, 4175.27] + - [677, 4175.27] - - [2944, 3584, 1, 128] - - [670, 4659.69] + - [673, 4659.69] - - [3584, 2368, 1, 256] - - [692, 5851.77] + - [695, 5851.77] - - [128, 3584, 1, 3328] - - [682, 6104.94] + - [685, 6104.94] - - [128, 1024, 1, 1280] - - [622, 3847.99] + - [625, 3847.99] - - [8448, 24000, 1, 2816] - - [692, 5128.54] + - [695, 5128.54] - - [64, 704, 1, 256] - - [625, 1253.73] + - [628, 1253.73] - - [4288, 256, 1, 1280] - - [681, 5625.76] + - [684, 5625.76] - - [3584, 3584, 1, 3328] - - [686, 8206.05] + - [689, 8206.05] - - [4, 704, 1, 128] - - [701, 29.4484] + - [704, 29.4484] - - [5888, 6784, 1, 256] - - [688, 8248.65] + - [691, 8248.65] - - [4288, 2944, 1, 3328] - - [686, 8657.02] + - [689, 8657.02] - - [2944, 64, 1, 128] - - [605, 1240.6] + - [608, 1240.6] - - [1024, 128, 1, 3328] - - [630, 4433.0] + - [633, 4433.0] - - [1024, 16, 1, 500000] - - [593, 2571.05] + - [596, 2571.05] - - [4288, 128, 1, 3328] - - [630, 5716.75] + - [633, 5716.75] - - [7680, 128, 1, 2560] - - [628, 5488.0] + - [631, 5488.0] - - [256, 5056, 1, 1280] - - [688, 6379.96] + - [691, 6379.96] - - [1408, 256, 1, 128] - - [674, 1633.73] + - [677, 1633.73] - - [2944, 5888, 1, 3328] - - [683, 7848.92] + - [686, 7848.92] - - [6784, 5888, 1, 1280] - - [692, 9047.62] + - [695, 9047.62] - - [2048, 800, 1, 512] - - [687, 4841.07] + - [690, 4841.07] - - [704, 128, 1, 256] - - [632, 1567.17] + - [635, 1567.17] - - [5888, 4288, 1, 1280] - - [686, 7982.83] + - [689, 7982.83] - - [1024, 24000, 1, 2048] - - [688, 5774.3] + - [691, 5774.3] - - [448, 256, 1, 1280] - - [622, 3707.09] + - [625, 3707.09] - - [5888, 3584, 1, 128] - - [675, 3804.4] + - [678, 3804.4] - - [1024, 2944, 1, 128] - - [670, 3308.26] + - [673, 3308.26] - - [5056, 4, 1, 1280] - - [703, 468.962] + - [706, 468.962] - - [256, 1408, 1, 1280] - - [681, 4899.89] + - [684, 4899.89] - - [3072, 16, 1, 1024] - - [639, 1233.62] + - [642, 1233.62] - - [704, 3584, 1, 128] - - [670, 3919.43] + - [673, 3919.43] - - [5888, 448, 1, 3328] - - [700, 6095.61] + - [703, 6095.61] - - [2368, 4288, 1, 1280] - - [682, 8338.3] + - [685, 8338.3] - - [4288, 2944, 1, 128] - - [674, 3946.5] + - [677, 3946.5] - - [1024, 6784, 1, 3328] - - [688, 7494.28] + - [691, 7494.28] - - [128, 2368, 1, 256] - - [687, 2895.32] + - [690, 2895.32] - - [6784, 64, 1, 3328] - - [681, 5964.89] + - [684, 5964.89] - - [5056, 2944, 1, 3328] - - [692, 6605.53] + - [695, 6605.53] - - [448, 128, 1, 256] - - [625, 1339.42] + - [628, 1339.42] - - [2944, 3584, 1, 256] - - [688, 7165.56] + - [691, 7165.56] - - [1408, 1408, 1, 3328] - - [692, 8332.86] + - [695, 8332.86] - - [1856, 128, 1, 1280] - - [687, 4498.33] + - [690, 4498.33] - - [3584, 3584, 1, 128] - - [671, 4000.01] + - [674, 4000.01] - - [64, 3584, 1, 256] - - [698, 2383.13] + - [701, 2383.13] - - [1408, 4, 1, 3328] - - [649, 422.908] + - [652, 422.908] - - [128, 2944, 1, 3328] - - [654, 5429.93] + - [657, 5429.93] - - [3584, 704, 1, 256] - - [687, 6153.99] + - [690, 6153.99] - - [2944, 448, 1, 3328] - - [687, 6507.72] + - [690, 6507.72] - - [3584, 1408, 1, 3328] - - [692, 8829.63] + - [695, 8829.63] - - [704, 3584, 1, 1280] - - [682, 7860.23] + - [685, 7860.23] - - [2944, 6784, 1, 1280] - - [692, 8894.5] + - [695, 8894.5] - - [1856, 6784, 1, 256] - - [692, 8115.09] + - [695, 8115.09] - - [4288, 448, 1, 3328] - - [684, 6397.25] + - [687, 6397.25] - - [6784, 4288, 1, 128] - - [670, 4109.44] + - [673, 4109.44] - - [6784, 704, 1, 1280] - - [680, 7999.04] + - [683, 7999.04] - - [256, 4288, 1, 256] - - [684, 4603.84] + - [687, 4603.84] - - [3584, 6784, 1, 256] - - [692, 7361.55] + - [695, 7361.55] - - [6144, 12000, 1, 2048] - - [691, 6311.66] + - [694, 6311.66] - - [6144, 16, 1, 2560] - - [640, 2240.55] + - [643, 2240.55] - - [3584, 64, 1, 128] - - [611, 1292.26] + - [614, 1292.26] - - [5888, 1024, 1, 3328] - - [679, 8394.49] + - [682, 8394.49] - - [448, 64, 1, 128] - - [602, 262.144] + - [605, 262.144] - - [704, 6784, 1, 1280] - - [686, 7740.56] + - [689, 7740.56] - - [4, 1024, 1, 1280] - - [639, 378.821] + - [642, 378.821] - - [5888, 128, 1, 256] - - [687, 5003.58] + - [690, 5003.58] - - [4096, 16, 1, 4096] - - [639, 1585.75] + - [642, 1585.75] - - [1856, 5056, 1, 3328] - - [680, 8522.82] + - [683, 8522.82] - - [4, 6784, 1, 256] - - [624, 387.657] + - [627, 387.657] - - [1024, 3584, 1, 128] - - [674, 3031.51] + - [677, 3031.51] - - [1024, 1408, 1, 128] - - [676, 2600.75] + - [679, 2600.75] - - [2368, 2944, 1, 128] - - [673, 4340.16] + - [676, 4340.16] - - [5056, 64, 1, 256] - - [687, 3109.52] + - [690, 3109.52] - - [4, 448, 1, 1280] - - [707, 253.735] + - [710, 253.735] - - [5056, 2944, 1, 128] - - [678, 3739.91] + - [681, 3739.91] - - [5888, 5056, 1, 3328] - - [692, 9016.38] + - [695, 9016.38] - - [1024, 704, 1, 128] - - [674, 2363.56] + - [677, 2363.56] - - [5888, 2368, 1, 128] - - [677, 3651.73] + - [680, 3651.73] - - [128, 5056, 1, 3328] - - [681, 6243.54] + - [684, 6243.54] - - [3584, 6784, 1, 1280] - - [679, 9080.57] + - [682, 9080.57] - - [448, 4, 1, 1280] - - [707, 242.983] + - [710, 242.983] - - [1856, 5888, 1, 256] - - [692, 8182.02] + - [695, 8182.02] - - [256, 256, 1, 256] - - [625, 1542.02] + - [628, 1542.02] - - [256, 64, 1, 128] - - [606, 135.126] + - [609, 135.126] - - [4288, 4288, 1, 3328] - - [692, 8674.54] + - [695, 8674.54] - - [4288, 1408, 1, 1280] - - [680, 7867.08] + - [683, 7867.08] - - [3584, 5056, 1, 128] - - [670, 4457.73] + - [673, 4457.73] - - [4, 1024, 1, 3328] - - [619, 440.294] + - [622, 440.294] - - [4288, 2368, 1, 256] - - [700, 5699.47] + - [703, 5699.47] - - [2944, 5056, 1, 1280] - - [692, 8236.46] + - [695, 8236.46] - - [448, 6784, 1, 256] - - [682, 6620.52] + - [685, 6620.52] - - [64, 128, 1, 128] - - [607, 67.5629] + - [610, 67.5629] - - [1856, 2368, 1, 128] - - [674, 4233.6] + - [677, 4233.6] - - [6784, 2368, 1, 3328] - - [692, 8269.8] + - [695, 8269.8] - - [256, 1024, 1, 1280] - - [681, 4882.78] + - [684, 4882.78] - - [704, 4, 1, 128] - - [701, 19.011] + - [704, 19.011] - - [256, 4, 1, 256] - - [639, 46.8114] + - [642, 46.8114] - - [4288, 128, 1, 256] - - [687, 4273.39] + - [690, 4273.39] - - [4288, 1856, 1, 3328] - - [682, 8195.71] + - [685, 8195.71] - - [3584, 448, 1, 128] - - [675, 2750.55] + - [678, 2750.55] - - [2048, 1600, 1, 2048] - - [698, 5753.49] + - [701, 5753.49] - - [256, 4, 1, 3328] - - [708, 297.878] + - [711, 297.878] - - [4, 1408, 1, 1280] - - [706, 402.286] + - [709, 402.286] - - [3584, 64, 1, 1280] - - [695, 4096.0] + - [698, 4096.0] - - [1408, 448, 1, 128] - - [670, 2498.15] + - [673, 2498.15] - - [3584, 1024, 1, 1280] - - [692, 7252.08] + - [695, 7252.08] - - [1856, 5056, 1, 256] - - [686, 7711.49] + - [689, 7711.49] - - [4, 3584, 1, 256] - - [703, 314.214] + - [706, 314.214] - - [4, 2944, 1, 1280] - - [639, 483.118] + - [642, 483.118] - - [1024, 4288, 1, 256] - - [691, 6544.42] + - [694, 6544.42] - - [5888, 3584, 1, 3328] - - [680, 8105.05] + - [683, 8105.05] - - [1856, 4, 1, 256] - - [639, 252.732] + - [642, 252.732] - - [4, 256, 1, 256] - - [624, 48.1882] + - [627, 48.1882] - - [5056, 3584, 1, 3328] - - [685, 7354.7] + - [688, 7354.7] - - [704, 448, 1, 128] - - [678, 1233.81] + - [681, 1233.81] - - [2368, 1408, 1, 1280] - - [686, 6654.14] + - [689, 6654.14] - - [5056, 2944, 1, 1280] - - [692, 8505.62] + - [695, 8505.62] - - [4, 4, 1, 128] - - [702, 0.0478505] + - [705, 0.0478505] - - [3584, 256, 1, 256] - - [684, 4616.37] + - [687, 4616.37] - - [1024, 6784, 1, 256] - - [686, 7944.88] + - [689, 7944.88] - - [4, 128, 1, 256] - - [639, 29.2571] + - [642, 29.2571] - - [64, 64, 1, 1280] - - [650, 642.51] + - [653, 642.51] - - [5124, 9124, 1, 2048] - - [692, 8019.3] + - [695, 8019.3] - - [6784, 4, 1, 128] - - [701, 192.967] + - [704, 192.967] - - [2944, 1408, 1, 128] - - [670, 3827.03] + - [673, 3827.03] - - [448, 128, 1, 3328] - - [643, 4063.9] + - [646, 4063.9] - - [3584, 1408, 1, 1280] - - [692, 7180.73] + - [695, 7180.73] - - [64, 4288, 1, 3328] - - [638, 4786.74] + - [641, 4786.74] - - [5056, 6784, 1, 3328] - - [679, 7889.73] + - [682, 7889.73] - - [128, 2944, 1, 256] - - [682, 3599.59] + - [685, 3599.59] - - [128, 6784, 1, 128] - - [600, 2606.69] + - [603, 2606.69] - - [3584, 4288, 1, 256] - - [686, 7299.71] + - [689, 7299.71] - - [448, 1856, 1, 256] - - [682, 5206.97] + - [685, 5206.97] - - [1856, 6784, 1, 3328] - - [684, 8386.26] + - [687, 8386.26] - - [3584, 128, 1, 3328] - - [628, 5589.94] + - [631, 5589.94] - - [64, 1856, 1, 256] - - [621, 1949.28] + - [624, 1949.28] - - [64, 448, 1, 256] - - [626, 955.733] + - [629, 955.733] - - [5888, 4288, 1, 256] - - [690, 7791.74] + - [693, 7791.74] - - [4, 448, 1, 128] - - [701, 8.74146] + - [704, 8.74146] - - [5056, 1408, 1, 256] - - [692, 5153.91] + - [695, 5153.91] - - [35, 8457, 1, 2048] - - [597, 3182.47] + - [600, 3182.47] - - [64, 256, 1, 1280] - - [646, 1713.36] + - [649, 1713.36] - - [3584, 1024, 1, 256] - - [682, 6528.08] + - [685, 6528.08] - - [256, 704, 1, 256] - - [681, 2720.36] + - [684, 2720.36] - - [5888, 5888, 1, 256] - - [690, 7992.16] + - [693, 7992.16] - - [4288, 1024, 1, 1280] - - [684, 7837.4] + - [687, 7837.4] - - [5888, 128, 1, 3328] - - [687, 7181.03] + - [690, 7181.03] - - [448, 6784, 1, 3328] - - [681, 7663.0] + - [684, 7663.0] - - [2944, 1408, 1, 1280] - - [690, 7903.04] + - [693, 7903.04] - - [64, 128, 1, 1280] - - [639, 1191.56] + - [642, 1191.56] - - [2944, 1856, 1, 3328] - - [680, 7844.31] + - [683, 7844.31] - - [2368, 64, 1, 128] - - [611, 997.873] + - [614, 997.873] - - [256, 1024, 1, 128] - - [670, 1215.74] + - [673, 1215.74] - - [3584, 5888, 1, 1280] - - [679, 8958.84] + - [682, 8958.84] - - [64, 4, 1, 128] - - [702, 1.11608] + - [705, 1.11608] - - [6784, 1856, 1, 1280] - - [679, 6728.7] + - [682, 6728.7] - - [2944, 5056, 1, 256] - - [692, 8275.11] + - [695, 8275.11] - - [4288, 4, 1, 128] - - [701, 147.544] + - [704, 147.544] - - [5888, 256, 1, 3328] - - [688, 7094.1] + - [691, 7094.1] - - [2944, 4288, 1, 128] - - [673, 4611.45] + - [676, 4611.45] - - [3584, 1408, 1, 256] - - [683, 6542.96] + - [686, 6542.96] - - [704, 3584, 1, 3328] - - [682, 8117.1] + - [685, 8117.1] - - [4096, 3200, 1, 1024] - - [697, 6656.03] + - [700, 6656.03] - - [5056, 448, 1, 1280] - - [695, 6096.1] + - [698, 6096.1] - - [3584, 1856, 1, 3328] - - [680, 8552.31] + - [683, 8552.31] - - [4288, 6784, 1, 1280] - - [686, 8212.36] + - [689, 8212.36] - - [2560, 7000, 1, 2560] - - [688, 7655.24] + - [691, 7655.24] - - [1408, 704, 1, 1280] - - [684, 5756.69] + - [687, 5756.69] - - [2944, 1024, 1, 256] - - [692, 6880.81] + - [695, 6880.81] - - [6784, 64, 1, 256] - - [687, 4438.86] + - [690, 4438.86] - - [2368, 4288, 1, 3328] - - [688, 8377.89] + - [691, 8377.89] - - [4, 1408, 1, 256] - - [705, 222.499] + - [708, 222.499] - - [1024, 1408, 1, 1280] - - [682, 6339.28] + - [685, 6339.28] - - [64, 64, 1, 256] - - [639, 187.246] + - [642, 187.246] - - [704, 256, 1, 3328] - - [681, 4046.04] + - [684, 4046.04] - - [6784, 5056, 1, 256] - - [692, 7972.07] + - [695, 7972.07] - - [1856, 1856, 1, 128] - - [676, 3716.51] + - [679, 3716.51] - - [3584, 5056, 1, 3328] - - [692, 8684.66] + - [695, 8684.66] - - [448, 6784, 1, 128] - - [674, 3828.95] + - [677, 3828.95] - - [4, 704, 1, 3328] - - [707, 393.106] + - [710, 393.106] - - [35, 8457, 1, 4096] - - [596, 3173.14] + - [599, 3173.14] - - [448, 2944, 1, 256] - - [690, 5553.31] + - [693, 5553.31] - - [4, 4288, 1, 3328] - - [649, 573.111] + - [652, 573.111] - - [2944, 6784, 1, 256] - - [686, 8565.96] + - [689, 8565.96] - - [2944, 2944, 1, 128] - - [670, 4540.73] + - [673, 4540.73] - - [4, 4, 1, 1280] - - [649, 3.04762] + - [652, 3.04762] - - [1856, 3584, 1, 1280] - - [686, 7306.26] + - [689, 7306.26] - - [64, 2944, 1, 256] - - [698, 2292.51] + - [701, 2292.51] - - [448, 256, 1, 128] - - [607, 797.83] + - [610, 797.83] - - [4288, 448, 1, 128] - - [673, 3430.4] + - [676, 3430.4] - - [4608, 24000, 1, 1536] - - [691, 6820.14] + - [694, 6820.14] - - [1856, 1408, 1, 3328] - - [694, 6600.14] + - [697, 6600.14] - - [128, 128, 1, 128] - - [599, 161.817] + - [602, 161.817] - - [1024, 4288, 1, 3328] - - [682, 7936.98] + - [685, 7936.98] - - [448, 2368, 1, 256] - - [690, 4526.35] + - [693, 4526.35] - - [1024, 4, 1, 128] - - [702, 16.8907] + - [705, 16.8907] - - [64, 1408, 1, 1280] - - [622, 3345.22] + - [625, 3345.22] - - [64, 6784, 1, 1280] - - [687, 5526.5] + - [690, 5526.5] - - [5056, 448, 1, 256] - - [681, 4216.55] + - [684, 4216.55] - - [2944, 2368, 1, 3328] - - [692, 7000.32] + - [695, 7000.32] - - [704, 4288, 1, 3328] - - [698, 6414.33] + - [701, 6414.33] - - [1408, 128, 1, 256] - - [681, 2720.36] + - [684, 2720.36] - - [1024, 1856, 1, 1280] - - [692, 7682.83] + - [695, 7682.83] - - [2048, 6400, 1, 2048] - - [688, 7418.12] + - [691, 7418.12] - - [512, 48000, 1, 2816] - - [692, 8884.67] + - [695, 8884.67] - - [5124, 9124, 1, 2560] - - [684, 6040.7] + - [687, 6040.7] - - [128, 2368, 1, 3328] - - [638, 5025.56] + - [641, 5025.56] - - [1024, 5888, 1, 256] - - [686, 7322.11] + - [689, 7322.11] - - [64, 2944, 1, 1280] - - [622, 4222.21] + - [625, 4222.21] - - [5056, 64, 1, 3328] - - [663, 4936.22] + - [666, 4936.22] - - [128, 704, 1, 128] - - [608, 683.314] + - [611, 683.314] - - [1408, 2368, 1, 256] - - [687, 6404.12] + - [690, 6404.12] - - [1408, 1408, 1, 256] - - [692, 4537.83] + - [695, 4537.83] - - [4, 64, 1, 128] - - [701, 2.46747] + - [704, 2.46747] - - [64, 1024, 1, 128] - - [600, 532.272] + - [603, 532.272] - - [1024, 8, 1, 500000] - - [590, 1684.98] + - [593, 1684.98] - - [2368, 2368, 1, 128] - - [671, 4334.23] + - [674, 4334.23] - - [64, 5888, 1, 128] - - [600, 2003.09] + - [603, 2003.09] - - [5888, 4, 1, 3328] - - [618, 339.018] + - [621, 339.018] - - [6784, 1408, 1, 128] - - [674, 4431.13] + - [677, 4431.13] - - [4288, 5888, 1, 256] - - [692, 7800.78] + - [695, 7800.78] - - [1408, 5056, 1, 256] - - [686, 8153.28] + - [689, 8153.28] - - [5056, 128, 1, 3328] - - [643, 5829.83] + - [646, 5829.83] - - [128, 128, 1, 1280] - - [646, 1691.25] + - [649, 1691.25] - - [448, 704, 1, 256] - - [687, 3364.18] + - [690, 3364.18] - - [4288, 3584, 1, 128] - - [671, 2952.58] + - [674, 2952.58] - - [2944, 128, 1, 3328] - - [643, 5620.72] + - [646, 5620.72] - - [64, 1408, 1, 3328] - - [644, 4169.81] + - [647, 4169.81] - - [3584, 5056, 1, 1280] - - [689, 7780.66] + - [692, 7780.66] - - [256, 448, 1, 1280] - - [622, 3929.35] + - [625, 3929.35] - - [704, 704, 1, 128] - - [670, 2346.07] + - [673, 2346.07] - - [5056, 4, 1, 128] - - [701, 144.457] + - [704, 144.457] - - [704, 256, 1, 1280] - - [690, 2283.12] + - [693, 2283.12] - - [64, 2368, 1, 3328] - - [622, 4921.59] + - [625, 4921.59] - - [1856, 1024, 1, 128] - - [671, 3459.47] + - [674, 3459.47] - - [1856, 64, 1, 128] - - [603, 918.137] + - [606, 918.137] - - [4096, 64, 1, 4096] - - [648, 4000.52] + - [651, 4000.52] - - [1024, 24000, 1, 1536] - - [684, 8502.26] + - [687, 8502.26] - - [704, 4288, 1, 256] - - [688, 6003.73] + - [691, 6003.73] - - [5888, 2368, 1, 1280] - - [679, 8801.2] + - [682, 8801.2] - - [128, 256, 1, 256] - - [633, 1069.98] + - [636, 1069.98] - - [64, 128, 1, 256] - - [639, 374.491] + - [642, 374.491] - - [2368, 5888, 1, 1280] - - [682, 8308.53] + - [685, 8308.53] - - [5888, 256, 1, 1280] - - [690, 7154.32] + - [693, 7154.32] - - [1760, 128, 1, 1760] - - [631, 5363.81] + - [634, 5363.81] - - [4, 5888, 1, 1280] - - [639, 542.204] + - [642, 542.204] - - [704, 128, 1, 128] - - [611, 779.347] + - [614, 779.347] - - [1024, 4, 1, 1280] - - [639, 392.431] + - [642, 392.431] - - [2368, 1856, 1, 3328] - - [682, 7975.22] + - [685, 7975.22] - - [2368, 128, 1, 128] - - [604, 1584.86] + - [607, 1584.86] - - [2944, 704, 1, 256] - - [690, 4039.11] + - [693, 4039.11] - - [5056, 128, 1, 128] - - [670, 2575.79] + - [673, 2575.79] - - [2368, 1024, 1, 3328] - - [698, 6165.44] + - [701, 6165.44] - - [256, 704, 1, 3328] - - [681, 4028.64] + - [684, 4028.64] - - [704, 3584, 1, 256] - - [692, 6102.82] + - [695, 6102.82] - - [704, 2944, 1, 3328] - - [682, 8202.74] + - [685, 8202.74] - - [6784, 1024, 1, 128] - - [674, 4386.3] + - [677, 4386.3] - - [256, 448, 1, 128] - - [611, 834.095] + - [614, 834.095] - - [448, 1024, 1, 3328] - - [699, 5412.38] + - [702, 5412.38] - - [2944, 1024, 1, 3328] - - [692, 6265.77] + - [695, 6265.77] - - [2944, 5056, 1, 128] - - [670, 4770.78] + - [673, 4770.78] - - [2368, 256, 1, 256] - - [687, 3975.13] + - [690, 3975.13] - - [1408, 6784, 1, 256] - - [686, 7986.92] + - [689, 7986.92] - - [6784, 1408, 1, 3328] - - [686, 8472.61] + - [689, 8472.61] - - [4288, 6784, 1, 128] - - [677, 3865.1] + - [680, 3865.1] - - [704, 64, 1, 256] - - [625, 1287.31] + - [628, 1287.31] - - [5888, 4, 1, 1280] - - [624, 509.922] + - [627, 509.922] - - [256, 2368, 1, 3328] - - [687, 5837.55] + - [690, 5837.55] - - [6784, 2944, 1, 1280] - - [692, 8560.44] + - [695, 8560.44] - - [4288, 1856, 1, 128] - - [670, 4616.97] + - [673, 4616.97] - - [1856, 2944, 1, 128] - - [670, 4287.63] + - [673, 4287.63] - - [6784, 448, 1, 128] - - [674, 3893.33] + - [677, 3893.33] - - [64, 3584, 1, 128] - - [600, 1609.66] + - [603, 1609.66] - - [448, 5056, 1, 1280] - - [690, 7124.31] + - [693, 7124.31] - - [2368, 1856, 1, 128] - - [673, 4004.55] + - [676, 4004.55] - - [64, 2944, 1, 3328] - - [623, 5086.38] + - [626, 5086.38] - - [4288, 704, 1, 256] - - [688, 6176.47] + - [691, 6176.47] - - [256, 3584, 1, 128] - - [671, 2553.05] + - [674, 2553.05] - - [5888, 704, 1, 256] - - [687, 6781.41] + - [690, 6781.41] - - [3584, 1024, 1, 128] - - [674, 3660.85] + - [677, 3660.85] - - [256, 5888, 1, 3328] - - [690, 7772.03] + - [693, 7772.03] - - [1408, 4288, 1, 3328] - - [686, 8832.76] + - [689, 8832.76] - - [6784, 4288, 1, 256] - - [692, 8566.04] + - [695, 8566.04] - - [4288, 256, 1, 128] - - [672, 1953.69] + - [675, 1953.69] - - [5888, 256, 1, 256] - - [690, 3730.43] + - [693, 3730.43] - - [6784, 1024, 1, 1280] - - [686, 8578.29] + - [689, 8578.29] - - [5888, 1024, 1, 128] - - [671, 4092.86] + - [674, 4092.86] - - [1024, 128, 1, 256] - - [621, 1897.88] + - [624, 1897.88] - - [512, 16, 1, 500000] - - [592, 2363.69] + - [595, 2363.69] - - [128, 64, 1, 3328] - - [649, 1592.46] + - [652, 1592.46] - - [448, 64, 1, 256] - - [639, 976.068] + - [642, 976.068] - - [2368, 256, 1, 128] - - [674, 2094.89] + - [677, 2094.89] - - [6784, 3584, 1, 1280] - - [686, 8570.06] + - [689, 8570.06] - - [1024, 6784, 1, 1280] - - [692, 8203.47] + - [695, 8203.47] - - [2944, 64, 1, 1280] - - [630, 4300.51] + - [633, 4300.51] - - [1408, 2944, 1, 1280] - - [682, 7349.54] + - [685, 7349.54] - - [256, 1856, 1, 256] - - [681, 4649.65] + - [684, 4649.65] - - [2048, 800, 1, 2048] - - [700, 4668.63] + - [703, 4668.63] - - [1408, 2368, 1, 3328] - - [690, 7537.64] + - [693, 7537.64] - - [2944, 4, 1, 3328] - - [639, 514.042] + - [642, 514.042] - - [128, 1408, 1, 3328] - - [631, 4991.54] + - [634, 4991.54] - - [2944, 1856, 1, 128] - - [670, 4317.29] + - [673, 4317.29] - - [256, 2944, 1, 128] - - [670, 2258.17] + - [673, 2258.17] - - [256, 6784, 1, 128] - - [670, 3146.92] + - [673, 3146.92] - - [2368, 4, 1, 128] - - [702, 33.8286] + - [705, 33.8286] - - [1408, 256, 1, 3328] - - [681, 5077.75] + - [684, 5077.75] - - [1856, 4, 1, 128] - - [702, 21.4025] + - [705, 21.4025] - - [5056, 6784, 1, 128] - - [670, 4945.01] + - [673, 4945.01] - - [4288, 5056, 1, 128] - - [673, 4729.77] + - [676, 4729.77] - - [1856, 5888, 1, 128] - - [670, 4707.86] + - [673, 4707.86] - - [2944, 5888, 1, 256] - - [684, 8014.68] + - [687, 8014.68] - - [3584, 1856, 1, 256] - - [686, 7567.03] + - [689, 7567.03] - - [4288, 3584, 1, 1280] - - [679, 8726.33] + - [682, 8726.33] - - [2368, 448, 1, 256] - - [687, 4227.6] + - [690, 4227.6] - - [4288, 256, 1, 3328] - - [688, 5487.31] + - [691, 5487.31] - - [1856, 704, 1, 128] - - [674, 3124.96] + - [677, 3124.96] - - [1408, 64, 1, 256] - - [634, 1619.99] + - [637, 1619.99] - - [64, 1856, 1, 128] - - [598, 955.047] + - [601, 955.047] - - [4, 256, 1, 128] - - [701, 10.7789] + - [704, 10.7789] - - [2560, 16, 1, 2560] - - [646, 2019.6] + - [649, 2019.6] - - [704, 5888, 1, 128] - - [675, 3976.16] + - [678, 3976.16] - - [6784, 3584, 1, 128] - - [674, 4018.81] + - [677, 4018.81] - - [1024, 64, 1, 256] - - [639, 1370.69] + - [642, 1370.69] - - [64, 2368, 1, 256] - - [681, 2255.66] + - [684, 2255.66] - - [4288, 5056, 1, 3328] - - [686, 8368.59] + - [689, 8368.59] - - [4, 1856, 1, 1280] - - [639, 392.026] + - [642, 392.026] - - [4288, 128, 1, 128] - - [604, 2286.93] + - [607, 2286.93] - - [1408, 1408, 1, 128] - - [674, 3233.38] + - [677, 3233.38] - - [7680, 16, 1, 2560] - - [642, 2257.27] + - [645, 2257.27] - - [1856, 128, 1, 128] - - [604, 1532.7] + - [607, 1532.7] - - [5056, 2368, 1, 256] - - [686, 8167.19] + - [689, 8167.19] - - [4288, 704, 1, 3328] - - [692, 6411.06] + - [695, 6411.06] - - [448, 3584, 1, 256] - - [692, 5477.64] + - [695, 5477.64] - - [2368, 64, 1, 1280] - - [622, 3936.42] + - [625, 3936.42] - - [2368, 1024, 1, 1280] - - [688, 7688.72] + - [691, 7688.72] - - [2944, 1408, 1, 3328] - - [679, 7668.68] + - [682, 7668.68] - - [1408, 448, 1, 256] - - [681, 4863.88] + - [684, 4863.88] - - [1024, 1408, 1, 3328] - - [690, 7448.89] + - [693, 7448.89] - - [2944, 5888, 1, 1280] - - [680, 8208.47] + - [683, 8208.47] - - [1408, 4, 1, 1280] - - [619, 479.319] + - [622, 479.319] - - [5888, 3584, 1, 256] - - [680, 8609.99] + - [683, 8609.99] - - [2368, 5056, 1, 128] - - [677, 3726.15] + - [680, 3726.15] - - [1408, 1856, 1, 3328] - - [681, 7829.38] + - [684, 7829.38] - - [4, 4, 1, 3328] - - [708, 4.29419] + - [711, 4.29419] - - [6784, 1408, 1, 1280] - - [681, 7690.7] + - [684, 7690.7] - - [4096, 7000, 1, 4096] - - [693, 6272.39] + - [696, 6272.39] - - [704, 2944, 1, 256] - - [682, 6095.81] + - [685, 6095.81] - - [4288, 64, 1, 256] - - [647, 2121.21] + - [650, 2121.21] - - [6784, 5888, 1, 3328] - - [686, 8955.5] + - [689, 8955.5] - - [2368, 4288, 1, 128] - - [670, 4699.55] + - [673, 4699.55] - - [64, 4288, 1, 1280] - - [660, 4013.63] + - [663, 4013.63] - - [6784, 64, 1, 1280] - - [681, 5418.73] + - [684, 5418.73] - - [3584, 128, 1, 128] - - [610, 2165.2] + - [613, 2165.2] - - [1024, 6784, 1, 128] - - [671, 3765.2] + - [674, 3765.2] - - [4, 1856, 1, 128] - - [702, 33.2728] + - [705, 33.2728] - - [1408, 64, 1, 3328] - - [643, 4489.41] + - [646, 4489.41] - - [6784, 4, 1, 256] - - [639, 400.162] + - [642, 400.162] - - [1408, 1408, 1, 1280] - - [686, 8139.43] + - [689, 8139.43] - - [16384, 400, 1, 4096] - - [690, 6087.18] + - [693, 6087.18] - - [256, 2368, 1, 256] - - [681, 4766.25] + - [684, 4766.25] - - [448, 4288, 1, 3328] - - [688, 7576.98] + - [691, 7576.98] - - [2368, 1408, 1, 256] - - [684, 5284.43] + - [687, 5284.43] - - [5888, 5056, 1, 128] - - [671, 3643.5] + - [674, 3643.5] - - [704, 2368, 1, 256] - - [686, 5334.63] + - [689, 5334.63] - - [1024, 24000, 1, 2560] - - [694, 7437.96] + - [697, 7437.96] - - [2944, 448, 1, 1280] - - [695, 4937.43] + - [698, 4937.43] - - [5888, 2368, 1, 3328] - - [680, 8201.74] + - [683, 8201.74] - - [5124, 9124, 1, 1760] - - [687, 6763.96] + - [690, 6763.96] - - [448, 1408, 1, 1280] - - [681, 5881.44] + - [684, 5881.44] - - [448, 1856, 1, 1280] - - [688, 6225.46] + - [691, 6225.46] - - [4288, 448, 1, 1280] - - [690, 5626.27] + - [693, 5626.27] - - [5888, 704, 1, 3328] - - [684, 7873.52] + - [687, 7873.52] - - [5056, 256, 1, 128] - - [675, 2920.93] + - [678, 2920.93] - - [1856, 256, 1, 128] - - [677, 1995.32] + - [680, 1995.32] - - [64, 1408, 1, 128] - - [598, 758.838] + - [601, 758.838] - - [704, 4, 1, 256] - - [639, 130.597] + - [642, 130.597] - - [1408, 5888, 1, 128] - - [670, 4573.95] + - [673, 4573.95] - - [7680, 12000, 1, 2560] - - [686, 8747.03] + - [689, 8747.03] - - [1408, 1024, 1, 256] - - [683, 4609.13] + - [686, 4609.13] - - [8192, 400, 1, 2048] - - [695, 5283.15] + - [698, 5283.15] - - [1024, 1856, 1, 128] - - [670, 2686.28] + - [673, 2686.28] - - [256, 704, 1, 128] - - [670, 1004.73] + - [673, 1004.73] - - [2560, 128, 1, 2560] - - [648, 4259.04] + - [651, 4259.04] - - [448, 1024, 1, 256] - - [681, 4813.14] + - [684, 4813.14] - - [128, 4, 1, 3328] - - [707, 128.308] + - [710, 128.308] - - [5056, 6784, 1, 1280] - - [689, 6579.75] + - [692, 6579.75] - - [1408, 64, 1, 128] - - [611, 819.2] + - [614, 819.2] - - [1024, 448, 1, 1280] - - [690, 5703.21] + - [693, 5703.21] - - [704, 5056, 1, 3328] - - [682, 7574.39] + - [685, 7574.39] - - [128, 5056, 1, 256] - - [681, 5113.43] + - [684, 5113.43] - - [64, 1024, 1, 3328] - - [666, 3980.0] + - [669, 3980.0] - - [1856, 4, 1, 3328] - - [620, 433.153] + - [623, 433.153] - - [4, 2944, 1, 128] - - [702, 46.5225] + - [705, 46.5225] - - [2368, 2944, 1, 3328] - - [680, 9002.03] + - [683, 9002.03] - - [448, 448, 1, 1280] - - [622, 3969.42] + - [625, 3969.42] - - [2368, 3584, 1, 256] - - [692, 7806.29] + - [695, 7806.29] - - [5056, 3584, 1, 1280] - - [679, 8971.46] + - [682, 8971.46] - - [5124, 9124, 1, 4096] - - [692, 7208.62] + - [695, 7208.62] - - [7680, 48000, 1, 2560] - - [686, 3835.81] + - [689, 3835.81] - - [448, 4, 1, 3328] - - [707, 409.6] + - [710, 409.6] - - [1856, 2944, 1, 1280] - - [679, 7173.61] + - [682, 7173.61] - - [1024, 48000, 1, 2816] - - [686, 8976.16] + - [689, 8976.16] - - [128, 1024, 1, 256] - - [625, 1969.16] + - [628, 1969.16] - - [2944, 1408, 1, 256] - - [688, 4585.02] + - [691, 4585.02] - - [4288, 1408, 1, 3328] - - [682, 8237.17] + - [685, 8237.17] - - [3584, 64, 1, 3328] - - [628, 5183.06] + - [631, 5183.06] - - [5888, 2944, 1, 128] - - [677, 3674.46] + - [680, 3674.46] - - [2944, 1024, 1, 128] - - [674, 3834.22] + - [677, 3834.22] - - [4288, 5056, 1, 1280] - - [686, 8086.0] + - [689, 8086.0] - - [5888, 6784, 1, 1280] - - [680, 6941.22] + - [683, 6941.22] - - [6784, 5056, 1, 128] - - [671, 4860.05] + - [674, 4860.05] - - [256, 1024, 1, 3328] - - [695, 5156.12] + - [698, 5156.12] - - [3584, 4, 1, 256] - - [639, 332.429] + - [642, 332.429] - - [1760, 1600, 1, 1760] - - [682, 6330.66] + - [685, 6330.66] - - [1856, 64, 1, 3328] - - [643, 4755.93] + - [646, 4755.93] - - [4, 128, 1, 3328] - - [707, 160.144] + - [710, 160.144] - - [5888, 1408, 1, 3328] - - [680, 8722.64] + - [683, 8722.64] - - [448, 2944, 1, 128] - - [673, 2997.53] + - [676, 2997.53] - - [2368, 1856, 1, 256] - - [681, 6662.24] + - [684, 6662.24] - - [256, 5056, 1, 256] - - [683, 5256.19] + - [686, 5256.19] - - [128, 3584, 1, 128] - - [602, 2073.46] + - [605, 2073.46] - - [448, 3584, 1, 3328] - - [679, 6833.86] + - [682, 6833.86] - - [4, 5056, 1, 3328] - - [649, 581.423] + - [652, 581.423] - - [704, 2368, 1, 128] - - [670, 3402.19] + - [673, 3402.19] - - [5888, 256, 1, 128] - - [675, 2977.44] + - [678, 2977.44] - - [4, 5056, 1, 128] - - [701, 65.1074] + - [704, 65.1074] - - [448, 256, 1, 256] - - [687, 1764.43] + - [690, 1764.43] - - [704, 4, 1, 3328] - - [639, 398.454] + - [642, 398.454] - - [1408, 256, 1, 256] - - [682, 3463.76] + - [685, 3463.76] - - [3584, 1856, 1, 128] - - [678, 3228.09] + - [681, 3228.09] - - [4288, 4288, 1, 128] - - [674, 4853.83] + - [677, 4853.83] - - [1856, 1024, 1, 3328] - - [698, 5994.58] + - [701, 5994.58] - - [128, 5888, 1, 3328] - - [652, 6512.75] + - [655, 6512.75] - - [1024, 5056, 1, 256] - - [692, 7859.32] + - [695, 7859.32] - - [5888, 5888, 1, 1280] - - [692, 8131.34] + - [695, 8131.34] - - [5056, 5888, 1, 128] - - [671, 4920.61] + - [674, 4920.61] - - [2368, 1408, 1, 3328] - - [690, 7110.64] + - [693, 7110.64] - - [1024, 48000, 1, 1536] - - [690, 8590.72] + - [693, 8590.72] - - [5888, 448, 1, 256] - - [691, 3567.64] + - [694, 3567.64] - - [2560, 3200, 1, 2560] - - [681, 7638.21] + - [684, 7638.21] - - [5888, 6784, 1, 128] - - [671, 3910.82] + - [674, 3910.82] - - [6144, 48000, 1, 2048] - - [692, 3412.85] + - [695, 3412.85] - - [6784, 5056, 1, 1280] - - [683, 7890.12] + - [686, 7890.12] - - [5056, 704, 1, 1280] - - [687, 7664.96] + - [690, 7664.96] - - [1024, 48000, 1, 2560] - - [692, 8188.4] + - [695, 8188.4] - - [4608, 32, 1, 1536] - - [660, 2856.87] + - [663, 2856.87] - - [1024, 2368, 1, 128] - - [670, 3019.25] + - [673, 3019.25] - - [128, 704, 1, 256] - - [621, 1696.23] + - [624, 1696.23] - - [2368, 448, 1, 3328] - - [687, 5799.19] + - [690, 5799.19] - - [128, 5888, 1, 1280] - - [681, 6680.65] + - [684, 6680.65] - - [16384, 800, 1, 4096] - - [686, 6322.12] + - [689, 6322.12] - - [448, 128, 1, 1280] - - [660, 2849.39] + - [663, 2849.39] - - [6784, 4, 1, 3328] - - [639, 563.02] + - [642, 563.02] - - [5888, 5056, 1, 1280] - - [686, 8631.23] + - [689, 8631.23] - - [1024, 64, 1, 3328] - - [661, 3481.86] + - [664, 3481.86] - - [3072, 48000, 1, 1024] - - [686, 9019.39] + - [689, 9019.39] - - [64, 3584, 1, 1280] - - [623, 4327.85] + - [626, 4327.85] - - [6784, 1408, 1, 256] - - [686, 6320.49] + - [689, 6320.49] - - [3584, 5888, 1, 128] - - [673, 4406.69] + - [676, 4406.69] - - [5056, 5888, 1, 256] - - [692, 8037.03] + - [695, 8037.03] - - [2368, 1024, 1, 256] - - [684, 4936.04] + - [687, 4936.04] - - [2944, 1856, 1, 256] - - [692, 7222.22] + - [695, 7222.22] - - [1856, 6784, 1, 1280] - - [682, 8251.71] + - [685, 8251.71] - - [64, 5056, 1, 128] - - [602, 1643.6] + - [605, 1643.6] - - [64, 6784, 1, 128] - - [600, 1929.67] + - [603, 1929.67] - - [448, 704, 1, 128] - - [672, 979.859] + - [675, 979.859] - - [4, 1024, 1, 128] - - [701, 20.0416] + - [704, 20.0416] - - [4288, 3584, 1, 256] - - [686, 8444.04] + - [689, 8444.04] - - [1408, 704, 1, 128] - - [670, 3020.9] + - [673, 3020.9] - - [64, 256, 1, 3328] - - [666, 2227.37] + - [669, 2227.37] - - [6784, 448, 1, 3328] - - [692, 6573.01] + - [695, 6573.01] - - [5056, 1856, 1, 1280] - - [684, 7976.13] + - [687, 7976.13] - - [1408, 1024, 1, 3328] - - [682, 7470.23] + - [685, 7470.23] - - [2368, 256, 1, 3328] - - [687, 5394.27] + - [690, 5394.27] - - [5888, 3584, 1, 1280] - - [679, 9031.45] + - [682, 9031.45] - - [1856, 3584, 1, 3328] - - [694, 7272.5] + - [697, 7272.5] - - [5888, 128, 1, 1280] - - [687, 6684.38] + - [690, 6684.38] - - [1024, 2944, 1, 256] - - [692, 7414.99] + - [695, 7414.99] - - [448, 6784, 1, 1280] - - [688, 7923.68] + - [691, 7923.68] - - [256, 3584, 1, 1280] - - [684, 6901.77] + - [687, 6901.77] - - [704, 5056, 1, 256] - - [689, 5004.45] + - [692, 5004.45] - - [3584, 1024, 1, 3328] - - [681, 7894.53] + - [684, 7894.53] - - [2944, 1856, 1, 1280] - - [686, 7903.17] + - [689, 7903.17] - - [128, 256, 1, 128] - - [599, 325.645] + - [602, 325.645] - - [5056, 256, 1, 256] - - [683, 3356.46] + - [686, 3356.46] - - [2944, 4288, 1, 3328] - - [692, 7813.83] + - [695, 7813.83] - - [2368, 3584, 1, 3328] - - [692, 8370.99] + - [695, 8370.99] - - [2944, 704, 1, 1280] - - [698, 5513.99] + - [701, 5513.99] - - [128, 4, 1, 256] - - [639, 25.2062] + - [642, 25.2062] - - [2944, 3584, 1, 1280] - - [686, 7738.73] + - [689, 7738.73] - - [1856, 5888, 1, 1280] - - [680, 8584.53] + - [683, 8584.53] - - [256, 256, 1, 1280] - - [660, 2962.08] + - [663, 2962.08] - - [2048, 3200, 1, 2048] - - [688, 6911.59] + - [691, 6911.59] - - [4288, 1408, 1, 256] - - [686, 7953.9] + - [689, 7953.9] - - [3584, 64, 1, 256] - - [687, 2780.32] + - [690, 2780.32] - - [64, 1856, 1, 3328] - - [622, 4911.94] + - [625, 4911.94] - - [256, 1408, 1, 128] - - [670, 1373.14] + - [673, 1373.14] - - [5888, 1408, 1, 128] - - [675, 4241.91] + - [678, 4241.91] - - [4288, 2368, 1, 1280] - - [684, 8012.6] + - [687, 8012.6] - - [4, 4288, 1, 256] - - [705, 301.574] + - [708, 301.574] - - [256, 4288, 1, 128] - - [670, 2706.26] + - [673, 2706.26] - - [2048, 128, 1, 2048] - - [665, 2885.16] + - [668, 2885.16] - - [256, 128, 1, 3328] - - [667, 3170.11] + - [670, 3170.11] - - [512, 8, 1, 500000] - - [591, 1915.02] + - [594, 1915.02] - - [6784, 2368, 1, 256] - - [686, 8323.56] + - [689, 8323.56] - - [5888, 128, 1, 128] - - [674, 2465.98] + - [677, 2465.98] - - [1024, 24000, 1, 2816] - - [684, 8131.54] + - [687, 8131.54] - - [7680, 5984, 1, 2560] - - [688, 6040.67] + - [691, 6040.67] - - [4288, 1856, 1, 256] - - [700, 5818.43] + - [703, 5818.43] - - [1856, 256, 1, 3328] - - [681, 6531.93] + - [684, 6531.93] - - [1856, 2944, 1, 256] - - [686, 7312.82] + - [689, 7312.82] - - [5056, 1024, 1, 128] - - [676, 4102.9] + - [679, 4102.9] - - [64, 5888, 1, 1280] - - [681, 5058.15] + - [684, 5058.15] - - [1760, 800, 1, 1760] - - [684, 7279.9] + - [687, 7279.9] - - [6784, 256, 1, 128] - - [674, 3257.59] + - [677, 3257.59] - - [5888, 704, 1, 128] - - [670, 3813.83] + - [673, 3813.83] - - [1408, 2368, 1, 128] - - [671, 3561.17] + - [674, 3561.17] - - [1024, 4288, 1, 1280] - - [690, 7752.64] + - [693, 7752.64] - - [2368, 5056, 1, 3328] - - [693, 7711.81] + - [696, 7711.81] - - [448, 4, 1, 128] - - [701, 18.3795] + - [704, 18.3795] - - [4, 256, 1, 3328] - - [708, 269.61] + - [711, 269.61] - - [4288, 1024, 1, 3328] - - [687, 7910.17] + - [690, 7910.17] - - [6144, 48000, 1, 2560] - - [686, 3540.99] + - [689, 3540.99] - - [1024, 5056, 1, 3328] - - [680, 8509.56] + - [683, 8509.56] - - [1024, 1856, 1, 3328] - - [686, 7907.83] + - [689, 7907.83] - - [704, 704, 1, 1280] - - [698, 5648.05] + - [701, 5648.05] - - [128, 2368, 1, 1280] - - [657, 4145.01] + - [660, 4145.01] - - [1408, 128, 1, 3328] - - [630, 4919.5] + - [633, 4919.5] - - [3584, 256, 1, 1280] - - [682, 5185.46] + - [685, 5185.46] - - [4, 128, 1, 128] - - [701, 2.97891] + - [704, 2.97891] - - [5888, 64, 1, 1280] - - [630, 4499.49] + - [633, 4499.49] - - [3584, 128, 1, 1280] - - [687, 5928.91] + - [690, 5928.91] - - [4, 256, 1, 1280] - - [706, 170.667] + - [709, 170.667] - - [128, 704, 1, 3328] - - [630, 4379.27] + - [633, 4379.27] - - [4288, 6784, 1, 256] - - [680, 7180.99] + - [683, 7180.99] - - [3584, 2944, 1, 3328] - - [686, 8553.2] + - [689, 8553.2] - - [128, 1856, 1, 256] - - [687, 3207.67] + - [690, 3207.67] - - [64, 4288, 1, 256] - - [681, 2907.89] + - [684, 2907.89] - - [4, 3584, 1, 3328] - - [639, 560.505] + - [642, 560.505] - - [64, 4, 1, 3328] - - [708, 67.4025] + - [711, 67.4025] - - [4, 64, 1, 3328] - - [708, 88.7467] + - [711, 88.7467] - - [5888, 2944, 1, 256] - - [686, 7255.67] + - [689, 7255.67] - - [1856, 64, 1, 256] - - [632, 1743.62] + - [635, 1743.62] - - [5056, 128, 1, 1280] - - [687, 6009.69] + - [690, 6009.69] - - [448, 4288, 1, 1280] - - [688, 6466.72] + - [691, 6466.72] - - [448, 1856, 1, 3328] - - [688, 6381.89] + - [691, 6381.89] - - [1024, 4288, 1, 128] - - [673, 3491.77] + - [676, 3491.77] - - [4, 1024, 1, 256] - - [706, 172.463] + - [709, 172.463] - - [5056, 4288, 1, 256] - - [686, 8241.42] + - [689, 8241.42] - - [1024, 448, 1, 256] - - [690, 4218.41] + - [693, 4218.41] - - [1024, 3584, 1, 256] - - [686, 6513.59] + - [689, 6513.59] - - [2944, 128, 1, 1280] - - [630, 4710.38] + - [633, 4710.38] - - [2048, 32, 1, 2048] - - [645, 1779.13] + - [648, 1779.13] - - [64, 256, 1, 256] - - [639, 655.36] + - [642, 655.36] - - [1408, 4, 1, 128] - - [702, 20.0249] + - [705, 20.0249] - - [128, 2368, 1, 128] - - [602, 1707.63] + - [605, 1707.63] - - [256, 704, 1, 1280] - - [681, 3735.21] + - [684, 3735.21] - - [64, 2368, 1, 128] - - [609, 1049.71] + - [612, 1049.71] - - [6784, 6784, 1, 3328] - - [686, 9277.84] + - [689, 9277.84] - - [448, 5888, 1, 1280] - - [692, 7319.65] + - [695, 7319.65] - - [5056, 448, 1, 128] - - [674, 3694.33] + - [677, 3694.33] - - [4288, 704, 1, 1280] - - [684, 7890.86] + - [687, 7890.86] - - [3584, 2944, 1, 128] - - [676, 4124.61] + - [679, 4124.61] - - [6784, 256, 1, 1280] - - [692, 7185.73] + - [695, 7185.73] - - [256, 2944, 1, 1280] - - [681, 6736.66] + - [684, 6736.66] - - [64, 4288, 1, 128] - - [600, 1614.31] + - [603, 1614.31] - - [2368, 5888, 1, 3328] - - [682, 8616.36] + - [685, 8616.36] - - [4, 64, 1, 256] - - [619, 11.3778] + - [622, 11.3778] - - [704, 1024, 1, 3328] - - [687, 6801.82] + - [690, 6801.82] - - [2368, 1856, 1, 1280] - - [684, 7853.47] + - [687, 7853.47] - - [448, 5056, 1, 3328] - - [687, 7452.94] + - [690, 7452.94] - - [128, 448, 1, 128] - - [602, 530.349] + - [605, 530.349] - - [128, 6784, 1, 256] - - [682, 5557.45] + - [685, 5557.45] - - [3584, 4288, 1, 128] - - [673, 4462.63] + - [676, 4462.63] - - [64, 448, 1, 128] - - [602, 278.032] + - [605, 278.032] - - [5888, 4288, 1, 3328] - - [679, 9153.45] + - [682, 9153.45] - - [2368, 704, 1, 256] - - [686, 5350.68] + - [689, 5350.68] - - [256, 1856, 1, 3328] - - [681, 6536.25] + - [684, 6536.25] - - [1856, 128, 1, 256] - - [695, 2847.26] + - [698, 2847.26] - - [6784, 128, 1, 128] - - [675, 2530.72] + - [678, 2530.72] - - [3584, 1408, 1, 128] - - [676, 3625.52] + - [679, 3625.52] - - [1856, 5056, 1, 1280] - - [682, 8123.29] + - [685, 8123.29] - - [2944, 1024, 1, 1280] - - [692, 8450.31] + - [695, 8450.31] - - [5056, 4, 1, 256] - - [706, 380.687] + - [709, 380.687] - - [3584, 5888, 1, 3328] - - [684, 8567.89] + - [687, 8567.89] - - [2368, 4288, 1, 256] - - [688, 7857.97] + - [691, 7857.97] - - [1024, 2368, 1, 3328] - - [682, 6776.35] + - [685, 6776.35] - - [64, 704, 1, 3328] - - [637, 3503.42] + - [640, 3503.42] - - [704, 1408, 1, 256] - - [682, 6099.89] + - [685, 6099.89] - - [4096, 128, 1, 4096] - - [662, 4116.47] + - [665, 4116.47] - - [1024, 3584, 1, 1280] - - [692, 7231.55] + - [695, 7231.55] - - [4288, 5888, 1, 3328] - - [686, 8762.32] + - [689, 8762.32] - - [4288, 4, 1, 1280] - - [639, 492.697] + - [642, 492.697] - - [4608, 16, 1, 1536] - - [640, 1892.48] + - [643, 1892.48] - - [5888, 64, 1, 128] - - [617, 1747.63] + - [620, 1747.63] - - [4, 5888, 1, 128] - - [702, 84.4915] + - [705, 84.4915] - - [1024, 2944, 1, 3328] - - [690, 6906.95] + - [693, 6906.95] - - [6784, 1856, 1, 256] - - [686, 6273.97] + - [689, 6273.97] - - [2048, 64, 1, 2048] - - [669, 2371.34] + - [672, 2371.34] - - [256, 6784, 1, 1280] - - [686, 7066.94] + - [689, 7066.94] - - [1856, 3584, 1, 256] - - [692, 7706.77] + - [695, 7706.77] - - [128, 448, 1, 3328] - - [637, 3995.83] + - [640, 3995.83] - - [6784, 1856, 1, 128] - - [674, 4458.99] + - [677, 4458.99] - - [4, 448, 1, 256] - - [639, 84.3294] + - [642, 84.3294] - - [5056, 128, 1, 256] - - [687, 4954.4] + - [690, 4954.4] - - [512, 24000, 1, 2816] - - [680, 8994.88] + - [683, 8994.88] - - [256, 5888, 1, 1280] - - [679, 6183.9] + - [682, 6183.9] - - [4, 128, 1, 1280] - - [707, 71.8597] + - [710, 71.8597] - - [16384, 1600, 1, 4096] - - [686, 6920.99] + - [689, 6920.99] - - [6784, 128, 1, 1280] - - [690, 6486.27] + - [693, 6486.27] - - [64, 1408, 1, 256] - - [627, 1647.76] + - [630, 1647.76] - - [2368, 1408, 1, 128] - - [674, 3937.0] + - [677, 3937.0] - - [1856, 448, 1, 256] - - [687, 4635.47] + - [690, 4635.47] - - [1408, 1024, 1, 128] - - [670, 3208.41] + - [673, 3208.41] - - [128, 64, 1, 128] - - [599, 70.092] + - [602, 70.092] - - [6784, 3584, 1, 3328] - - [692, 8466.18] + - [695, 8466.18] - - [1760, 7000, 1, 1760] - - [690, 8149.11] + - [693, 8149.11] - - [2944, 64, 1, 3328] - - [623, 5017.99] + - [626, 5017.99] - - [64, 64, 1, 128] - - [599, 35.4249] + - [602, 35.4249] - - [2368, 5056, 1, 1280] - - [686, 8763.9] + - [689, 8763.9] - - [64, 4, 1, 1280] - - [708, 43.5745] + - [711, 43.5745] - - [1408, 2368, 1, 1280] - - [687, 7660.28] + - [690, 7660.28] - - [128, 1408, 1, 1280] - - [622, 4185.17] + - [625, 4185.17] - - [256, 64, 1, 3328] - - [647, 2071.65] + - [650, 2071.65] - - [704, 4288, 1, 128] - - [670, 4069.08] + - [673, 4069.08] - - [128, 1856, 1, 3328] - - [653, 5776.05] + - [656, 5776.05] - - [2944, 2944, 1, 256] - - [692, 7949.21] + - [695, 7949.21] - - [2944, 4, 1, 1280] - - [639, 483.118] + - [642, 483.118] - - [5888, 4, 1, 256] - - [624, 396.665] + - [627, 396.665] - - [6784, 256, 1, 256] - - [698, 4044.73] + - [701, 4044.73] - - [256, 5056, 1, 3328] - - [681, 7607.27] + - [684, 7607.27] - - [128, 4288, 1, 1280] - - [622, 4958.68] + - [625, 4958.68] - - [5056, 1856, 1, 128] - - [674, 4560.84] + - [677, 4560.84] - - [5056, 1024, 1, 3328] - - [686, 8634.08] + - [689, 8634.08] - - [128, 128, 1, 256] - - [624, 699.051] + - [627, 699.051] - - [1760, 64, 1, 1760] - - [630, 4580.55] + - [633, 4580.55] - - [4288, 3584, 1, 3328] - - [692, 9143.66] + - [695, 9143.66] - - [448, 704, 1, 3328] - - [681, 4473.33] + - [684, 4473.33] - - [448, 448, 1, 128] - - [612, 1264.28] + - [615, 1264.28] - - [1024, 2368, 1, 1280] - - [690, 7452.41] + - [693, 7452.41] - - [1856, 704, 1, 3328] - - [681, 6103.24] + - [684, 6103.24] - - [4, 2368, 1, 128] - - [701, 95.919] + - [704, 95.919] - - [5888, 6784, 1, 3328] - - [686, 9131.64] + - [689, 9131.64] - - [704, 4288, 1, 1280] - - [688, 7906.36] + - [691, 7906.36] - - [704, 256, 1, 256] - - [681, 2772.68] + - [684, 2772.68] - - [1024, 48000, 1, 2048] - - [685, 6513.35] + - [688, 6513.35] - - [4288, 1024, 1, 128] - - [670, 4291.67] + - [673, 4291.67] - - [256, 64, 1, 3136] - - [711, 3015.27] + - [714, 3015.27] - - [256, 1024, 1, 196] - - [715, 4225.35] + - [718, 4225.35] - - [1024, 1024, 1, 3328] - - [827, 8705.0] + - [830, 8705.0] - - [2048, 200, 1, 3200] - - [832, 6173.32] + - [835, 6173.32] - - [1024, 200, 1, 13312] - - [730, 5213.21] + - [733, 5213.21] - - [1024, 256, 1, 1536] - - [832, 5859.33] + - [835, 5859.33] - - [4096, 256, 1, 12288] - - [837, 8807.42] + - [840, 8807.42] - - [64, 200, 1, 1024] - - [804, 366.532] + - [807, 366.532] - - [32, 512, 1, 1024] - - [759, 452.949] + - [762, 452.949] - - [2048, 256, 1, 3328] - - [821, 7876.63] + - [824, 7876.63] - - [4096, 512, 1, 32] - - [825, 3975.64] + - [828, 3975.64] - - [2048, 256, 1, 13312] - - [802, 7837.71] + - [805, 7837.71] - - [4096, 200, 1, 11264] - - [837, 6902.66] + - [840, 6902.66] - - [2048, 512, 1, 1024] - - [831, 8100.04] + - [834, 8100.04] - - [2048, 1024, 1, 1664] - - [731, 9081.98] + - [734, 9081.98] - - [1024, 1024, 1, 64] - - [827, 4258.18] + - [830, 4258.18] - - [512, 1024, 1, 1536] - - [821, 7597.23] + - [824, 7597.23] - - [1024, 256, 1, 15360] - - [722, 6735.14] + - [725, 6735.14] - - [1, 512, 1, 1024] - - [772, 15.0657] + - [775, 15.0657] - - [4096, 512, 1, 1408] - - [734, 9024.42] + - [737, 9024.42] - - [1024, 200, 1, 1408] - - [832, 4460.99] + - [835, 4460.99] - - [1024, 512, 1, 512] - - [826, 6528.1] + - [829, 6528.1] - - [4096, 256, 1, 15360] - - [833, 8823.93] + - [836, 8823.93] - - [2048, 512, 1, 640] - - [823, 7989.15] + - [826, 7989.15] - - [4096, 1024, 1, 1280] - - [729, 9421.44] + - [732, 9421.44] - - [1024, 200, 1, 6144] - - [821, 4966.42] + - [824, 4966.42] - - [1024, 1024, 1, 512] - - [823, 7731.44] + - [826, 7731.44] - - [128, 512, 1, 2048] - - [739, 2190.24] + - [742, 2190.24] - - [2048, 1024, 1, 640] - - [729, 8581.7] + - [732, 8581.7] - - [1024, 256, 1, 3328] - - [821, 6192.61] + - [824, 6192.61] - - [4096, 1024, 1, 13312] - - [734, 9642.49] + - [737, 9642.49] - - [2048, 256, 1, 2048] - - [821, 7485.65] + - [824, 7485.65] - - [2048, 1024, 1, 13312] - - [734, 9352.16] + - [737, 9352.16] - - [2048, 512, 1, 16640] - - [822, 8839.07] + - [825, 8839.07] - - [1024, 512, 1, 128] - - [826, 4279.9] + - [829, 4279.9] - - [2048, 1024, 1, 3584] - - [729, 9264.62] + - [732, 9264.62] - - [2048, 512, 1, 256] - - [837, 6990.51] + - [840, 6990.51] - - [512, 256, 1, 3200] - - [784, 4154.42] + - [787, 4154.42] - - [4096, 1024, 1, 1920] - - [729, 9535.22] + - [732, 9535.22] - - [4096, 200, 1, 2560] - - [834, 6754.55] + - [837, 6754.55] - - [1024, 256, 1, 16384] - - [724, 6289.5] + - [727, 6289.5] - - [1024, 1024, 1, 1152] - - [827, 8407.29] + - [830, 8407.29] - - [2048, 200, 1, 32] - - [770, 1412.41] + - [773, 1412.41] - - [512, 1024, 1, 2816] - - [821, 7843.15] + - [824, 7843.15] - - [4096, 256, 1, 14336] - - [833, 8844.67] + - [836, 8844.67] - - [1024, 200, 1, 4608] - - [832, 4931.64] + - [835, 4931.64] - - [1024, 200, 1, 16384] - - [727, 5135.05] + - [730, 5135.05] - - [64, 256, 1, 1024] - - [805, 460.913] + - [808, 460.913] - - [1, 200, 1, 1024] - - [787, 7.39884] + - [790, 7.39884] - - [2048, 200, 1, 2080] - - [832, 6033.77] + - [835, 6033.77] - - [512, 256, 1, 1792] - - [742, 3153.61] + - [745, 3153.61] - - [2048, 200, 1, 1024] - - [832, 5711.2] + - [835, 5711.2] - - [4096, 1024, 1, 12288] - - [729, 9658.13] + - [732, 9658.13] - - [4096, 200, 1, 4096] - - [823, 6834.45] + - [826, 6834.45] - - [1024, 512, 1, 11264] - - [790, 7686.36] + - [793, 7686.36] - - [128, 512, 1, 1024] - - [760, 1458.89] + - [763, 1458.89] - - [32, 256, 1, 2048] - - [778, 384.799] + - [781, 384.799] - - [1024, 200, 1, 1792] - - [832, 4638.54] + - [835, 4638.54] - - [1024, 1024, 1, 1792] - - [827, 8550.46] + - [830, 8550.46] - - [32, 256, 1, 512] - - [811, 161.319] + - [814, 161.319] - - [512, 200, 1, 2816] - - [737, 3353.0] + - [740, 3353.0] - - [512, 200, 1, 3072] - - [722, 3298.79] + - [725, 3298.79] - - [1024, 1024, 1, 8192] - - [768, 8369.0] + - [771, 8369.0] - - [1024, 256, 1, 12288] - - [725, 6475.61] + - [728, 6475.61] - - [4096, 200, 1, 768] - - [827, 6367.87] + - [830, 6367.87] - - [1024, 512, 1, 16384] - - [843, 7367.02] + - [846, 7367.02] - - [4096, 256, 1, 1024] - - [823, 8214.06] + - [826, 8214.06] - - [1024, 512, 1, 256] - - [826, 5537.03] + - [829, 5537.03] - - [4096, 1024, 1, 8320] - - [729, 9674.16] + - [732, 9674.16] - - [4096, 256, 1, 9216] - - [831, 8790.92] + - [834, 8790.92] - - [1024, 512, 1, 1408] - - [821, 7459.55] + - [824, 7459.55] - - [1024, 512, 1, 5632] - - [832, 7997.81] + - [835, 7997.81] - - [4096, 200, 1, 256] - - [837, 5371.8] + - [840, 5371.8] - - [1024, 200, 1, 128] - - [815, 1998.05] + - [818, 1998.05] - - [256, 200, 1, 1024] - - [784, 1195.91] + - [787, 1195.91] - - [1024, 200, 1, 5120] - - [832, 4957.34] + - [835, 4957.34] - - [512, 1024, 1, 3072] - - [845, 7103.97] + - [848, 7103.97] - - [4096, 1024, 1, 15360] - - [729, 9668.94] + - [732, 9668.94] - - [1, 256, 1, 2048] - - [771, 13.8262] + - [774, 13.8262] - - [1024, 1024, 1, 4160] - - [823, 8759.2] + - [826, 8759.2] - - [1024, 256, 1, 256] - - [830, 3728.27] + - [833, 3728.27] - - [2048, 256, 1, 384] - - [832, 6123.07] + - [835, 6123.07] - - [512, 256, 1, 2560] - - [786, 3809.54] + - [789, 3809.54] - - [4096, 512, 1, 3072] - - [734, 9215.09] + - [737, 9215.09] - - [1024, 256, 1, 4160] - - [821, 6293.39] + - [824, 6293.39] - - [4096, 512, 1, 13312] - - [731, 9367.22] + - [734, 9367.22] - - [4096, 1024, 1, 3840] - - [729, 9631.47] + - [732, 9631.47] - - [4096, 200, 1, 640] - - [827, 6206.06] + - [830, 6206.06] - - [32, 200, 1, 2048] - - [765, 303.407] + - [768, 303.407] - - [1024, 200, 1, 512] - - [821, 3713.09] + - [824, 3713.09] - - [1024, 1024, 1, 7168] - - [824, 8475.64] + - [827, 8475.64] - - [2048, 1024, 1, 3200] - - [729, 9271.24] + - [732, 9271.24] - - [512, 512, 1, 1536] - - [832, 5832.17] + - [835, 5832.17] - - [4096, 256, 1, 768] - - [837, 8065.97] + - [840, 8065.97] - - [2048, 256, 1, 6656] - - [821, 8034.77] + - [824, 8034.77] - - [1024, 256, 1, 896] - - [821, 5467.44] + - [824, 5467.44] - - [2048, 256, 1, 512] - - [832, 6465.21] + - [835, 6465.21] - - [2048, 200, 1, 3072] - - [832, 6165.68] + - [835, 6165.68] - - [128, 200, 1, 1024] - - [789, 692.77] + - [792, 692.77] - - [4096, 512, 1, 3840] - - [734, 9272.6] + - [737, 9272.6] - - [1024, 200, 1, 3200] - - [832, 4838.75] + - [835, 4838.75] - - [4096, 512, 1, 5632] - - [729, 9335.42] + - [732, 9335.42] - - [4096, 512, 1, 64] - - [764, 5275.85] + - [767, 5275.85] - - [1024, 512, 1, 2816] - - [821, 7816.58] + - [824, 7816.58] - - [4096, 256, 1, 7680] - - [827, 8795.4] + - [830, 8795.4] - - [4096, 200, 1, 1024] - - [837, 6448.81] + - [840, 6448.81] - - [1024, 512, 1, 12288] - - [791, 7624.57] + - [794, 7624.57] - - [2048, 1024, 1, 512] - - [734, 8436.06] + - [737, 8436.06] - - [128, 256, 1, 2048] - - [808, 1342.18] + - [811, 1342.18] - - [2048, 200, 1, 1792] - - [832, 6020.37] + - [835, 6020.37] - - [1024, 1024, 1, 2816] - - [823, 8670.4] + - [826, 8670.4] - - [2048, 512, 1, 1536] - - [834, 8466.22] + - [837, 8466.22] - - [4096, 256, 1, 3072] - - [831, 8631.37] + - [834, 8631.37] - - [1024, 200, 1, 1536] - - [813, 4577.6] + - [816, 4577.6] - - [1024, 256, 1, 1024] - - [821, 5491.72] + - [824, 5491.72] - - [4096, 512, 1, 8192] - - [734, 9325.54] + - [737, 9325.54] - - [128, 1024, 1, 512] - - [832, 2534.32] + - [835, 2534.32] - - [4096, 512, 1, 2304] - - [729, 9192.99] + - [732, 9192.99] - - [2048, 256, 1, 5632] - - [832, 7999.54] + - [835, 7999.54] - - [1024, 256, 1, 5120] - - [832, 6307.22] + - [835, 6307.22] - - [1024, 512, 1, 6656] - - [832, 8028.85] + - [835, 8028.85] - - [4096, 512, 1, 2816] - - [729, 9234.4] + - [732, 9234.4] - - [4096, 200, 1, 2080] - - [816, 6697.86] + - [819, 6697.86] - - [1024, 200, 1, 2304] - - [832, 4752.81] + - [835, 4752.81] - - [2048, 200, 1, 13312] - - [821, 6346.13] + - [824, 6346.13] - - [64, 1024, 1, 1024] - - [805, 1359.58] + - [808, 1359.58] - - [4096, 256, 1, 3584] - - [827, 8668.8] + - [830, 8668.8] - - [2048, 1024, 1, 7680] - - [729, 9365.78] + - [732, 9365.78] - - [1024, 256, 1, 1664] - - [821, 5907.47] + - [824, 5907.47] - - [1, 512, 1, 2048] - - [748, 23.4057] + - [751, 23.4057] - - [512, 512, 1, 1024] - - [821, 5360.13] + - [824, 5360.13] - - [2048, 256, 1, 8192] - - [793, 7665.21] + - [796, 7665.21] - - [2048, 512, 1, 512] - - [823, 7767.23] + - [826, 7767.23] - - [4096, 512, 1, 1920] - - [729, 9132.94] + - [732, 9132.94] - - [4096, 200, 1, 12288] - - [837, 6910.65] + - [840, 6910.65] - - [1024, 512, 1, 3072] - - [767, 7310.33] + - [770, 7310.33] - - [2048, 512, 1, 1152] - - [827, 8342.26] + - [830, 8342.26] - - [1024, 256, 1, 2080] - - [821, 6010.36] + - [824, 6010.36] - - [4096, 1024, 1, 32] - - [817, 4793.49] + - [820, 4793.49] - - [4096, 512, 1, 16640] - - [729, 9365.31] + - [732, 9365.31] - - [2048, 200, 1, 9216] - - [821, 6315.88] + - [824, 6315.88] - - [2048, 200, 1, 2560] - - [821, 6119.14] + - [824, 6119.14] - - [2048, 1024, 1, 1024] - - [729, 8628.59] + - [732, 8628.59] - - [2048, 256, 1, 4608] - - [821, 7951.29] + - [824, 7951.29] - - [512, 200, 1, 768] - - [773, 2132.41] + - [776, 2132.41] - - [128, 256, 1, 512] - - [773, 670.017] + - [776, 670.017] - - [4096, 512, 1, 1792] - - [734, 9126.91] + - [737, 9126.91] - - [4096, 1024, 1, 8192] - - [729, 9591.27] + - [732, 9591.27] - - [1024, 256, 1, 2816] - - [832, 6119.01] + - [835, 6119.01] - - [1024, 1024, 1, 13312] - - [824, 8529.27] + - [827, 8529.27] - - [2048, 1024, 1, 4160] - - [729, 9305.57] + - [732, 9305.57] - - [2048, 256, 1, 3584] - - [821, 7903.13] + - [824, 7903.13] - - [128, 200, 1, 2048] - - [789, 1135.81] + - [792, 1135.81] - - [4096, 512, 1, 10240] - - [731, 9339.49] + - [734, 9339.49] - - [4096, 512, 1, 512] - - [729, 8446.68] + - [732, 8446.68] - - [2048, 1024, 1, 6656] - - [729, 9331.65] + - [732, 9331.65] - - [1024, 512, 1, 640] - - [821, 6775.94] + - [824, 6775.94] - - [2048, 512, 1, 768] - - [823, 8085.41] + - [826, 8085.41] - - [2048, 200, 1, 1408] - - [821, 5880.07] + - [824, 5880.07] - - [4096, 200, 1, 2048] - - [837, 6691.61] + - [840, 6691.61] - - [1024, 1024, 1, 5632] - - [823, 8749.53] + - [826, 8749.53] - - [2048, 512, 1, 3584] - - [827, 8704.13] + - [830, 8704.13] - - [64, 512, 1, 512] - - [763, 667.883] + - [766, 667.883] - - [64, 200, 1, 512] - - [773, 251.288] + - [776, 251.288] - - [1024, 200, 1, 64] - - [728, 1310.72] + - [731, 1310.72] - - [512, 512, 1, 2304] - - [821, 6078.7] + - [824, 6078.7] - - [2048, 1024, 1, 14336] - - [729, 9321.84] + - [732, 9321.84] - - [4096, 512, 1, 11264] - - [731, 9339.85] + - [734, 9339.85] - - [4096, 512, 1, 128] - - [816, 6566.43] + - [819, 6566.43] - - [1024, 512, 1, 64] - - [836, 2953.74] + - [839, 2953.74] - - [4096, 512, 1, 768] - - [729, 8738.13] + - [732, 8738.13] - - [4096, 1024, 1, 11264] - - [729, 9637.68] + - [732, 9637.68] - - [1, 256, 1, 1024] - - [819, 8.83234] + - [822, 8.83234] - - [4096, 200, 1, 7680] - - [816, 6889.47] + - [819, 6889.47] - - [1024, 200, 1, 12288] - - [788, 5237.64] + - [791, 5237.64] - - [1024, 1024, 1, 1280] - - [823, 8418.07] + - [826, 8418.07] - - [4096, 1024, 1, 16640] - - [729, 9674.91] + - [732, 9674.91] - - [2048, 1024, 1, 5632] - - [729, 9327.75] + - [732, 9327.75] - - [1024, 200, 1, 15360] - - [788, 5386.53] + - [791, 5386.53] - - [1, 1024, 1, 1024] - - [838, 27.2499] + - [841, 27.2499] - - [2048, 256, 1, 16384] - - [799, 7652.65] + - [802, 7652.65] - - [4096, 512, 1, 12288] - - [731, 9359.41] + - [734, 9359.41] - - [2048, 200, 1, 896] - - [832, 5628.86] + - [835, 5628.86] - - [4096, 1024, 1, 5632] - - [729, 9626.68] + - [732, 9626.68] - - [2048, 256, 1, 32] - - [825, 1889.33] + - [828, 1889.33] - - [2048, 256, 1, 1280] - - [821, 7390.84] + - [824, 7390.84] - - [4096, 256, 1, 4096] - - [823, 8694.27] + - [826, 8694.27] - - [2048, 256, 1, 11264] - - [821, 8113.85] + - [824, 8113.85] - - [4096, 200, 1, 9216] - - [823, 6890.98] + - [826, 6890.98] - - [1024, 512, 1, 4096] - - [769, 7348.36] + - [772, 7348.36] - - [2048, 1024, 1, 10240] - - [731, 9095.81] + - [734, 9095.81] - - [4096, 1024, 1, 640] - - [729, 9115.58] + - [732, 9115.58] - - [128, 1024, 1, 2048] - - [722, 3270.41] + - [725, 3270.41] - - [4096, 200, 1, 3840] - - [816, 6836.16] + - [819, 6836.16] - - [1024, 1024, 1, 1920] - - [827, 8562.72] + - [830, 8562.72] - - [2048, 200, 1, 7168] - - [832, 6296.13] + - [835, 6296.13] - - [2048, 512, 1, 16384] - - [723, 8632.41] + - [726, 8632.41] - - [2048, 1024, 1, 12288] - - [729, 9157.98] + - [732, 9157.98] - - [4096, 1024, 1, 10240] - - [729, 9658.74] + - [732, 9658.74] - - [1024, 1024, 1, 8320] - - [831, 8799.48] + - [834, 8799.48] - - [1024, 256, 1, 9216] - - [821, 6375.13] + - [824, 6375.13] - - [4096, 256, 1, 1152] - - [816, 8300.99] + - [819, 8300.99] - - [512, 200, 1, 2560] - - [782, 3088.41] + - [785, 3088.41] - - [2048, 256, 1, 1920] - - [821, 7714.84] + - [824, 7714.84] - - [2048, 1024, 1, 4608] - - [729, 9305.6] + - [732, 9305.6] - - [512, 256, 1, 1024] - - [829, 2887.64] + - [832, 2887.64] - - [1024, 256, 1, 1920] - - [813, 5913.02] + - [816, 5913.02] - - [4096, 512, 1, 3584] - - [729, 9275.59] + - [732, 9275.59] - - [2048, 512, 1, 4160] - - [834, 8733.93] + - [837, 8733.93] - - [2048, 512, 1, 5632] - - [837, 8758.88] + - [840, 8758.88] - - [4096, 1024, 1, 4608] - - [729, 9657.12] + - [732, 9657.12] - - [4096, 1024, 1, 3328] - - [729, 9621.35] + - [732, 9621.35] - - [4096, 256, 1, 7168] - - [823, 8769.95] + - [826, 8769.95] - - [4096, 200, 1, 128] - - [837, 4458.23] + - [840, 4458.23] - - [2048, 200, 1, 5120] - - [821, 6176.81] + - [824, 6176.81] - - [1024, 1024, 1, 6656] - - [823, 8780.35] + - [826, 8780.35] - - [512, 1024, 1, 3200] - - [832, 7886.99] + - [835, 7886.99] - - [512, 200, 1, 2304] - - [722, 2990.99] + - [725, 2990.99] - - [2048, 1024, 1, 9216] - - [734, 9325.36] + - [737, 9325.36] - - [2048, 256, 1, 1536] - - [832, 7551.63] + - [835, 7551.63] - - [4096, 256, 1, 256] - - [837, 6932.73] + - [840, 6932.73] - - [2048, 512, 1, 1408] - - [834, 8430.76] + - [837, 8430.76] - - [1024, 256, 1, 384] - - [826, 4462.03] + - [829, 4462.03] - - [2048, 1024, 1, 2304] - - [729, 9174.84] + - [732, 9174.84] - - [4096, 512, 1, 6144] - - [731, 9284.15] + - [734, 9284.15] - - [1024, 200, 1, 14336] - - [720, 5268.47] + - [723, 5268.47] - - [1024, 512, 1, 2080] - - [832, 7736.37] + - [835, 7736.37] - - [2048, 512, 1, 2304] - - [834, 8615.97] + - [837, 8615.97] - - [4096, 512, 1, 15360] - - [734, 9362.07] + - [737, 9362.07] - - [1024, 256, 1, 32] - - [754, 1028.02] + - [757, 1028.02] - - [1024, 200, 1, 2816] - - [832, 4780.48] + - [835, 4780.48] - - [4096, 200, 1, 512] - - [823, 6054.13] + - [826, 6054.13] - - [4096, 1024, 1, 7168] - - [734, 9468.39] + - [737, 9468.39] - - [2048, 256, 1, 14336] - - [795, 7865.42] + - [798, 7865.42] - - [1024, 200, 1, 3072] - - [832, 4804.1] + - [835, 4804.1] - - [2048, 200, 1, 1280] - - [832, 5846.21] + - [835, 5846.21] - - [1024, 1024, 1, 2304] - - [823, 8633.22] + - [826, 8633.22] - - [4096, 1024, 1, 9216] - - [729, 9640.93] + - [732, 9640.93] - - [2048, 512, 1, 4608] - - [834, 8743.2] + - [837, 8743.2] - - [4096, 1024, 1, 7680] - - [729, 9684.76] + - [732, 9684.76] - - [4096, 256, 1, 6144] - - [834, 8757.14] + - [837, 8757.14] - - [4096, 256, 1, 896] - - [827, 8258.83] + - [830, 8258.83] - - [512, 256, 1, 1536] - - [811, 3065.26] + - [814, 3065.26] - - [1024, 256, 1, 512] - - [821, 4752.75] + - [824, 4752.75] - - [2048, 256, 1, 640] - - [821, 6775.94] + - [824, 6775.94] - - [256, 256, 1, 2048] - - [758, 2248.96] + - [761, 2248.96] - - [2048, 1024, 1, 8192] - - [729, 9178.07] + - [732, 9178.07] - - [4096, 200, 1, 16640] - - [721, 7009.49] + - [724, 7009.49] - - [256, 512, 1, 512] - - [733, 2511.56] + - [736, 2511.56] - - [2048, 512, 1, 384] - - [834, 7467.6] + - [837, 7467.6] - - [2048, 200, 1, 16384] - - [802, 6327.21] + - [805, 6327.21] - - [4096, 200, 1, 10240] - - [827, 6892.64] + - [830, 6892.64] - - [1024, 512, 1, 9216] - - [776, 7529.99] + - [779, 7529.99] - - [4096, 1024, 1, 64] - - [751, 6260.16] + - [754, 6260.16] - - [4096, 200, 1, 1920] - - [837, 6710.17] + - [840, 6710.17] - - [2048, 1024, 1, 1280] - - [729, 8998.24] + - [732, 8998.24] - - [1024, 200, 1, 3840] - - [821, 4873.77] + - [824, 4873.77] - - [256, 1024, 1, 512] - - [832, 4766.25] + - [835, 4766.25] - - [2048, 1024, 1, 3328] - - [729, 9275.1] + - [732, 9275.1] - - [1024, 256, 1, 16640] - - [786, 6837.12] + - [789, 6837.12] - - [4096, 512, 1, 14336] - - [734, 9354.32] + - [737, 9354.32] - - [1024, 1024, 1, 16640] - - [831, 8832.27] + - [834, 8832.27] - - [1024, 256, 1, 1152] - - [832, 5642.56] + - [835, 5642.56] - - [512, 512, 1, 512] - - [821, 4779.83] + - [824, 4779.83] - - [4096, 512, 1, 8320] - - [734, 9327.86] + - [737, 9327.86] - - [2048, 512, 1, 7680] - - [837, 8793.86] + - [840, 8793.86] - - [4096, 1024, 1, 6656] - - [729, 9666.93] + - [732, 9666.93] - - [1024, 512, 1, 3584] - - [832, 7900.47] + - [835, 7900.47] - - [1024, 1024, 1, 32] - - [817, 2974.68] + - [820, 2974.68] - - [512, 512, 1, 2816] - - [813, 6155.75] + - [816, 6155.75] - - [2048, 512, 1, 1664] - - [837, 8496.45] + - [840, 8496.45] - - [1024, 1024, 1, 14336] - - [723, 8624.64] + - [726, 8624.64] - - [2048, 200, 1, 2048] - - [832, 6029.76] + - [835, 6029.76] - - [1024, 1024, 1, 3584] - - [823, 8702.52] + - [826, 8702.52] - - [512, 200, 1, 1280] - - [737, 2350.65] + - [740, 2350.65] - - [4096, 256, 1, 6656] - - [837, 8788.31] + - [840, 8788.31] - - [4096, 256, 1, 4160] - - [814, 8728.34] + - [817, 8728.34] - - [128, 256, 1, 1024] - - [796, 859.489] + - [799, 859.489] - - [512, 200, 1, 3200] - - [737, 3376.75] + - [740, 3376.75] - - [2048, 512, 1, 9216] - - [820, 8806.3] + - [823, 8806.3] - - [2048, 1024, 1, 256] - - [816, 7713.66] + - [819, 7713.66] - - [1024, 256, 1, 2304] - - [832, 6015.73] + - [835, 6015.73] - - [1024, 200, 1, 8192] - - [832, 5021.92] + - [835, 5021.92] - - [2048, 256, 1, 3072] - - [749, 7514.99] + - [752, 7514.99] - - [2048, 256, 1, 8320] - - [821, 8063.58] + - [824, 8063.58] - - [4096, 512, 1, 1024] - - [731, 8824.31] + - [734, 8824.31] - - [1024, 512, 1, 3200] - - [821, 7866.29] + - [824, 7866.29] - - [1024, 512, 1, 896] - - [813, 7161.01] + - [816, 7161.01] - - [2048, 512, 1, 1280] - - [827, 8384.42] + - [830, 8384.42] - - [4096, 200, 1, 64] - - [736, 3260.5] + - [739, 3260.5] - - [1024, 256, 1, 6144] - - [842, 6143.62] + - [845, 6143.62] - - [1024, 200, 1, 2560] - - [821, 4762.79] + - [824, 4762.79] - - [1024, 1024, 1, 5120] - - [750, 8454.13] + - [753, 8454.13] - - [2048, 512, 1, 6656] - - [827, 8798.95] + - [830, 8798.95] - - [4096, 1024, 1, 1536] - - [729, 9503.27] + - [732, 9503.27] - - [1024, 1024, 1, 128] - - [752, 5825.42] + - [755, 5825.42] - - [512, 1024, 1, 1792] - - [821, 7701.02] + - [824, 7701.02] - - [2048, 1024, 1, 32] - - [732, 3938.31] + - [735, 3938.31] - - [4096, 256, 1, 2816] - - [816, 8652.1] + - [819, 8652.1] - - [1024, 1024, 1, 15360] - - [723, 8719.6] + - [726, 8719.6] - - [1024, 256, 1, 5632] - - [821, 6344.08] + - [824, 6344.08] - - [1024, 1024, 1, 4096] - - [824, 8187.76] + - [827, 8187.76] - - [2048, 200, 1, 4160] - - [832, 6222.38] + - [835, 6222.38] - - [512, 256, 1, 768] - - [763, 2771.57] + - [766, 2771.57] - - [4096, 512, 1, 640] - - [734, 8590.48] + - [737, 8590.48] - - [2048, 512, 1, 8192] - - [776, 8494.8] + - [779, 8494.8] - - [1024, 512, 1, 768] - - [821, 7049.25] + - [824, 7049.25] - - [4096, 200, 1, 8320] - - [816, 6908.6] + - [819, 6908.6] - - [2048, 512, 1, 896] - - [823, 8224.13] + - [826, 8224.13] - - [4096, 200, 1, 7168] - - [834, 6878.49] + - [837, 6878.49] - - [2048, 512, 1, 13312] - - [822, 8802.94] + - [825, 8802.94] - - [64, 512, 1, 1024] - - [726, 843.924] + - [729, 843.924] - - [2048, 200, 1, 3840] - - [821, 6192.38] + - [824, 6192.38] - - [1024, 1024, 1, 768] - - [814, 8098.41] + - [817, 8098.41] - - [4096, 512, 1, 16384] - - [734, 9345.63] + - [737, 9345.63] - - [4096, 256, 1, 2304] - - [814, 8596.35] + - [817, 8596.35] - - [1, 256, 1, 4096] - - [819, 19.8293] + - [822, 19.8293] - - [1024, 1024, 1, 11264] - - [824, 8491.38] + - [827, 8491.38] - - [2048, 200, 1, 16640] - - [818, 6510.54] + - [821, 6510.54] - - [1024, 256, 1, 3072] - - [832, 6179.45] + - [835, 6179.45] - - [4096, 1024, 1, 512] - - [729, 9032.15] + - [732, 9032.15] - - [2048, 256, 1, 2816] - - [821, 7793.47] + - [824, 7793.47] - - [32, 512, 1, 512] - - [733, 318.716] + - [736, 318.716] - - [256, 512, 1, 2048] - - [784, 3368.92] + - [787, 3368.92] - - [1024, 512, 1, 384] - - [832, 6198.48] + - [835, 6198.48] - - [2048, 200, 1, 7680] - - [821, 6307.6] + - [824, 6307.6] - - [1024, 512, 1, 4608] - - [832, 7953.38] + - [835, 7953.38] - - [4096, 200, 1, 32] - - [781, 2199.19] + - [784, 2199.19] - - [4096, 200, 1, 3328] - - [816, 6813.02] + - [819, 6813.02] - - [1024, 200, 1, 1152] - - [821, 4375.55] + - [824, 4375.55] - - [1024, 1024, 1, 1408] - - [823, 8457.81] + - [826, 8457.81] - - [2048, 200, 1, 15360] - - [797, 6333.0] + - [800, 6333.0] - - [512, 1024, 1, 2048] - - [807, 6280.66] + - [810, 6280.66] - - [1024, 512, 1, 1024] - - [832, 7064.09] + - [835, 7064.09] - - [1024, 200, 1, 10240] - - [821, 5030.59] + - [824, 5030.59] - - [4096, 256, 1, 5632] - - [834, 8765.12] + - [837, 8765.12] - - [512, 512, 1, 3072] - - [844, 5942.34] + - [847, 5942.34] - - [2048, 256, 1, 1408] - - [821, 7544.95] + - [824, 7544.95] - - [2048, 256, 1, 6144] - - [832, 7963.87] + - [835, 7963.87] - - [4096, 256, 1, 3328] - - [827, 8682.48] + - [830, 8682.48] - - [1024, 200, 1, 1664] - - [821, 4595.3] + - [824, 4595.3] - - [2048, 1024, 1, 1152] - - [729, 8942.55] + - [732, 8942.55] - - [2048, 512, 1, 6144] - - [822, 8729.61] + - [825, 8729.61] - - [2048, 512, 1, 3200] - - [823, 8696.46] + - [826, 8696.46] - - [4096, 1024, 1, 2080] - - [762, 9538.35] + - [765, 9538.35] - - [4096, 1024, 1, 768] - - [729, 9260.65] + - [732, 9260.65] - - [4096, 1024, 1, 2560] - - [729, 9567.17] + - [732, 9567.17] - - [64, 200, 1, 2048] - - [761, 583.061] + - [764, 583.061] - - [2048, 200, 1, 4608] - - [832, 6243.18] + - [835, 6243.18] - - [1024, 1024, 1, 6144] - - [824, 8320.15] + - [827, 8320.15] - - [4096, 256, 1, 1664] - - [827, 8503.07] + - [830, 8503.07] - - [2048, 200, 1, 384] - - [832, 4939.9] + - [835, 4939.9] - - [1, 200, 1, 2048] - - [778, 11.2281] + - [781, 11.2281] - - [4096, 256, 1, 1792] - - [837, 8504.02] + - [840, 8504.02] - - [2048, 1024, 1, 64] - - [751, 5309.25] + - [754, 5309.25] - - [4096, 1024, 1, 16384] - - [718, 9428.51] + - [721, 9428.51] - - [1024, 512, 1, 16640] - - [832, 8122.45] + - [835, 8122.45] - - [2048, 512, 1, 10240] - - [822, 8766.11] + - [825, 8766.11] - - [4096, 512, 1, 6656] - - [729, 9351.65] + - [732, 9351.65] - - [2048, 256, 1, 16640] - - [821, 8135.17] + - [824, 8135.17] - - [2048, 512, 1, 2816] - - [823, 8660.22] + - [826, 8660.22] - - [1024, 200, 1, 32] - - [741, 780.191] + - [744, 780.191] - - [1, 512, 1, 4096] - - [766, 34.7671] + - [769, 34.7671] - - [256, 256, 1, 1024] - - [773, 1489.98] + - [776, 1489.98] - - [2048, 1024, 1, 128] - - [746, 6605.2] + - [749, 6605.2] - - [2048, 1024, 1, 2080] - - [729, 9159.41] + - [732, 9159.41] - - [2048, 1024, 1, 16640] - - [729, 9371.55] + - [732, 9371.55] - - [1024, 200, 1, 384] - - [832, 3378.14] + - [835, 3378.14] - - [4096, 256, 1, 384] - - [777, 7369.2] + - [780, 7369.2] - - [4096, 256, 1, 13312] - - [831, 8776.38] + - [834, 8776.38] - - [2048, 256, 1, 128] - - [826, 4279.9] + - [829, 4279.9] - - [512, 256, 1, 2304] - - [738, 3584.88] + - [741, 3584.88] - - [2048, 1024, 1, 3072] - - [731, 9156.42] + - [734, 9156.42] - - [1024, 1024, 1, 640] - - [827, 7928.74] + - [830, 7928.74] - - [256, 512, 1, 1024] - - [832, 2843.6] + - [835, 2843.6] - - [4096, 1024, 1, 1408] - - [729, 9437.46] + - [732, 9437.46] - - [4096, 200, 1, 5632] - - [834, 6873.86] + - [837, 6873.86] - - [4096, 1024, 1, 2048] - - [729, 9437.0] + - [732, 9437.0] - - [2048, 1024, 1, 2560] - - [734, 9195.52] + - [737, 9195.52] - - [4096, 1024, 1, 128] - - [816, 7407.16] + - [819, 7407.16] - - [1024, 200, 1, 3328] - - [832, 4857.29] + - [835, 4857.29] - - [2048, 200, 1, 1152] - - [821, 5760.0] + - [824, 5760.0] - - [1024, 200, 1, 9216] - - [720, 5053.11] + - [723, 5053.11] - - [4096, 256, 1, 512] - - [814, 7617.35] + - [817, 7617.35] - - [4096, 1024, 1, 14336] - - [729, 9665.02] + - [732, 9665.02] - - [1024, 1024, 1, 384] - - [752, 7478.7] + - [755, 7478.7] - - [2048, 200, 1, 512] - - [821, 5150.18] + - [824, 5150.18] - - [2048, 256, 1, 9216] - - [800, 7717.61] + - [803, 7717.61] - - [2048, 256, 1, 1792] - - [821, 7655.84] + - [824, 7655.84] - - [4096, 512, 1, 9216] - - [731, 9331.12] + - [734, 9331.12] - - [4096, 200, 1, 15360] - - [721, 6958.04] + - [724, 6958.04] - - [1024, 512, 1, 2048] - - [820, 7067.81] + - [823, 7067.81] - - [64, 256, 1, 2048] - - [745, 723.156] + - [748, 723.156] - - [4096, 200, 1, 1792] - - [823, 6699.55] + - [826, 6699.55] - - [1, 200, 1, 4096] - - [755, 15.5387] + - [758, 15.5387] - - [2048, 1024, 1, 2048] - - [734, 9071.83] + - [737, 9071.83] - - [1024, 200, 1, 2080] - - [813, 4679.09] + - [816, 4679.09] - - [2048, 200, 1, 1536] - - [832, 5939.82] + - [835, 5939.82] - - [1024, 1024, 1, 3072] - - [794, 8333.05] + - [797, 8333.05] - - [512, 200, 1, 1792] - - [719, 2679.63] + - [722, 2679.63] - - [1024, 256, 1, 11264] - - [722, 6470.88] + - [725, 6470.88] - - [2048, 512, 1, 12288] - - [769, 8729.14] + - [772, 8729.14] - - [1024, 256, 1, 1792] - - [832, 5931.34] + - [835, 5931.34] - - [1024, 200, 1, 7168] - - [832, 4970.23] + - [835, 4970.23] - - [32, 256, 1, 1024] - - [743, 237.234] + - [746, 237.234] - - [512, 256, 1, 3072] - - [786, 3813.0] + - [789, 3813.0] - - [1024, 1024, 1, 2080] - - [823, 8600.31] + - [826, 8600.31] - - [2048, 200, 1, 2304] - - [832, 6093.22] + - [835, 6093.22] - - [4096, 512, 1, 1536] - - [729, 9074.9] + - [732, 9074.9] - - [2048, 256, 1, 7168] - - [832, 7895.16] + - [835, 7895.16] - - [2048, 512, 1, 1792] - - [834, 8531.82] + - [837, 8531.82] - - [1024, 200, 1, 2048] - - [821, 4685.33] + - [824, 4685.33] - - [1024, 1024, 1, 4608] - - [827, 8735.61] + - [830, 8735.61] - - [4096, 256, 1, 8192] - - [823, 8782.45] + - [826, 8782.45] - - [512, 1024, 1, 1280] - - [813, 7483.15] + - [816, 7483.15] - - [2048, 1024, 1, 16384] - - [723, 8878.86] + - [726, 8878.86] - - [512, 512, 1, 1280] - - [821, 5745.62] + - [824, 5745.62] - - [1024, 200, 1, 1280] - - [813, 4446.13] + - [816, 4446.13] - - [2048, 256, 1, 3200] - - [821, 7842.75] + - [824, 7842.75] - - [2048, 512, 1, 15360] - - [769, 8757.14] + - [772, 8757.14] - - [1024, 512, 1, 3328] - - [821, 7853.94] + - [824, 7853.94] - - [1024, 512, 1, 4160] - - [821, 7934.51] + - [824, 7934.51] - - [4096, 200, 1, 6656] - - [823, 6883.2] + - [826, 6883.2] - - [4096, 1024, 1, 1024] - - [729, 9229.34] + - [732, 9229.34] - - [2048, 200, 1, 3328] - - [832, 6182.64] + - [835, 6182.64] - - [1024, 1024, 1, 256] - - [752, 6932.73] + - [755, 6932.73] - - [512, 200, 1, 512] - - [773, 1910.67] + - [776, 1910.67] - - [2048, 256, 1, 64] - - [744, 2912.71] + - [747, 2912.71] - - [1024, 256, 1, 2560] - - [821, 6123.07] + - [824, 6123.07] - - [2048, 512, 1, 11264] - - [833, 8728.84] + - [836, 8728.84] - - [32, 200, 1, 1024] - - [828, 187.46] + - [831, 187.46] - - [32, 512, 1, 2048] - - [772, 694.421] + - [775, 694.421] - - [2048, 256, 1, 2304] - - [821, 7759.25] + - [824, 7759.25] - - [2048, 256, 1, 12288] - - [800, 7726.25] + - [803, 7726.25] - - [4096, 200, 1, 8192] - - [823, 6870.84] + - [826, 6870.84] - - [1024, 512, 1, 7168] - - [769, 7479.1] + - [772, 7479.1] - - [1024, 512, 1, 1792] - - [821, 7626.01] + - [824, 7626.01] - - [4096, 1024, 1, 1664] - - [729, 9503.44] + - [732, 9503.44] - - [4096, 200, 1, 2816] - - [816, 6775.34] + - [819, 6775.34] - - [1024, 1024, 1, 896] - - [823, 8229.89] + - [826, 8229.89] - - [1024, 200, 1, 8320] - - [784, 5173.48] + - [787, 5173.48] - - [1024, 1024, 1, 12288] - - [824, 8463.11] + - [827, 8463.11] - - [1024, 256, 1, 8320] - - [813, 6404.27] + - [816, 6404.27] - - [1024, 200, 1, 1024] - - [821, 4297.44] + - [824, 4297.44] - - [1024, 200, 1, 16640] - - [783, 5499.41] + - [786, 5499.41] - - [4096, 256, 1, 5120] - - [837, 8729.05] + - [840, 8729.05] - - [1024, 256, 1, 3200] - - [832, 6124.86] + - [835, 6124.86] - - [512, 512, 1, 2560] - - [832, 6109.69] + - [835, 6109.69] - - [4096, 256, 1, 2048] - - [837, 8510.95] + - [840, 8510.95] - - [1024, 256, 1, 640] - - [821, 5102.56] + - [824, 5102.56] - - [2048, 256, 1, 5120] - - [749, 7667.83] + - [752, 7667.83] - - [2048, 256, 1, 7680] - - [832, 8054.35] + - [835, 8054.35] - - [4096, 512, 1, 384] - - [827, 8190.67] + - [830, 8190.67] - - [2048, 200, 1, 3584] - - [821, 6166.02] + - [824, 6166.02] - - [1024, 512, 1, 1536] - - [821, 7517.8] + - [824, 7517.8] - - [4096, 512, 1, 3328] - - [729, 9259.35] + - [732, 9259.35] - - [4096, 1024, 1, 256] - - [729, 8341.69] + - [732, 8341.69] - - [2048, 200, 1, 64] - - [792, 2307.61] + - [795, 2307.61] - - [2048, 200, 1, 4096] - - [832, 6211.94] + - [835, 6211.94] - - [1024, 1024, 1, 1536] - - [823, 8484.05] + - [826, 8484.05] - - [2048, 1024, 1, 7168] - - [731, 9315.14] + - [734, 9315.14] - - [1024, 256, 1, 3584] - - [821, 6207.22] + - [824, 6207.22] - - [4096, 256, 1, 32] - - [825, 2892.62] + - [828, 2892.62] - - [4096, 256, 1, 1280] - - [834, 8392.8] + - [837, 8392.8] - - [512, 512, 1, 3200] - - [832, 6219.31] + - [835, 6219.31] - - [2048, 1024, 1, 1536] - - [731, 9052.45] + - [734, 9052.45] - - [2048, 256, 1, 1024] - - [821, 7192.8] + - [824, 7192.8] - - [128, 200, 1, 512] - - [811, 502.577] + - [814, 502.577] - - [4096, 512, 1, 7168] - - [734, 9329.01] + - [737, 9329.01] - - [1024, 512, 1, 1152] - - [821, 7358.43] + - [824, 7358.43] - - [64, 1024, 1, 2048] - - [739, 2102.41] + - [742, 2102.41] - - [2048, 512, 1, 3328] - - [823, 8694.59] + - [826, 8694.59] - - [4096, 1024, 1, 896] - - [729, 9342.92] + - [732, 9342.92] - - [1, 1024, 1, 2048] - - [779, 40.8324] + - [782, 40.8324] - - [4096, 200, 1, 3584] - - [827, 6810.2] + - [830, 6810.2] - - [4096, 1024, 1, 4096] - - [729, 9347.46] + - [732, 9347.46] - - [1024, 256, 1, 14336] - - [722, 6625.7] + - [725, 6625.7] - - [2048, 200, 1, 256] - - [821, 4413.2] + - [824, 4413.2] - - [4096, 256, 1, 16384] - - [723, 8752.03] + - [726, 8752.03] - - [4096, 256, 1, 1920] - - [814, 8533.68] + - [817, 8533.68] - - [32, 1024, 1, 512] - - [812, 647.269] + - [815, 647.269] - - [1024, 256, 1, 7680] - - [832, 6387.26] + - [835, 6387.26] - - [2048, 256, 1, 1664] - - [832, 7631.34] + - [835, 7631.34] - - [512, 200, 1, 1536] - - [737, 2576.78] + - [740, 2576.78] - - [2048, 1024, 1, 6144] - - [718, 9033.67] + - [721, 9033.67] - - [512, 256, 1, 2816] - - [784, 3977.36] + - [787, 3977.36] - - [4096, 512, 1, 4160] - - [731, 9288.92] + - [734, 9288.92] - - [4096, 512, 1, 2080] - - [810, 9150.18] + - [813, 9150.18] - - [2048, 256, 1, 15360] - - [795, 7963.87] + - [798, 7963.87] - - [4096, 200, 1, 5120] - - [834, 6861.52] + - [837, 6861.52] - - [1024, 512, 1, 8192] - - [820, 7473.15] + - [823, 7473.15] - - [4096, 200, 1, 896] - - [837, 6443.15] + - [840, 6443.15] - - [2048, 512, 1, 8320] - - [827, 8810.14] + - [830, 8810.14] - - [1024, 1024, 1, 10240] - - [835, 8436.6] + - [838, 8436.6] - - [1024, 200, 1, 768] - - [821, 4087.48] + - [824, 4087.48] - - [2048, 200, 1, 640] - - [832, 5416.2] + - [835, 5416.2] - - [512, 200, 1, 2048] - - [786, 2702.52] + - [789, 2702.52] - - [1024, 1024, 1, 9216] - - [824, 8498.98] + - [827, 8498.98] - - [4096, 200, 1, 1408] - - [834, 6613.72] + - [837, 6613.72] - - [1024, 256, 1, 13312] - - [722, 6643.44] + - [725, 6643.44] - - [1024, 256, 1, 128] - - [753, 2706.0] + - [756, 2706.0] - - [2048, 200, 1, 5632] - - [832, 6270.02] + - [835, 6270.02] - - [64, 1024, 1, 512] - - [811, 1310.72] + - [814, 1310.72] - - [1024, 512, 1, 2560] - - [832, 7731.44] + - [835, 7731.44] - - [4096, 200, 1, 1280] - - [814, 6566.73] + - [817, 6566.73] - - [1024, 200, 1, 4096] - - [832, 4911.36] + - [835, 4911.36] - - [1024, 1024, 1, 2560] - - [823, 8630.25] + - [826, 8630.25] - - [2048, 512, 1, 64] - - [827, 4152.78] + - [830, 4152.78] - - [2048, 200, 1, 8192] - - [821, 6234.11] + - [824, 6234.11] - - [2048, 512, 1, 3072] - - [831, 8614.75] + - [834, 8614.75] - - [4096, 1024, 1, 5120] - - [729, 9573.65] + - [732, 9573.65] - - [4096, 256, 1, 640] - - [816, 7913.78] + - [819, 7913.78] - - [1024, 256, 1, 1280] - - [821, 5706.54] + - [824, 5706.54] - - [2048, 1024, 1, 1920] - - [731, 9141.24] + - [734, 9141.24] - - [2048, 256, 1, 4096] - - [821, 7937.18] + - [824, 7937.18] - - [2048, 1024, 1, 15360] - - [734, 9351.86] + - [737, 9351.86] - - [4096, 200, 1, 16384] - - [723, 6975.11] + - [726, 6975.11] - - [1, 1024, 1, 4096] - - [841, 60.6815] + - [844, 60.6815] - - [4096, 1024, 1, 2816] - - [729, 9583.88] + - [732, 9583.88] - - [4096, 200, 1, 1664] - - [816, 6658.6] + - [819, 6658.6] - - [4096, 512, 1, 256] - - [747, 7731.44] + - [750, 7731.44] - - [1024, 200, 1, 896] - - [821, 4193.35] + - [824, 4193.35] - - [2048, 200, 1, 6656] - - [832, 6291.07] + - [835, 6291.07] - - [2048, 1024, 1, 5120] - - [731, 9270.47] + - [734, 9270.47] - - [512, 1024, 1, 768] - - [821, 7098.96] + - [824, 7098.96] - - [2048, 512, 1, 14336] - - [801, 8559.03] + - [804, 8559.03] - - [2048, 200, 1, 8320] - - [821, 6314.62] + - [824, 6314.62] - - [4096, 256, 1, 3840] - - [837, 8718.46] + - [840, 8718.46] - - [2048, 1024, 1, 4096] - - [718, 8973.28] + - [721, 8973.28] - - [1024, 1024, 1, 3200] - - [827, 8701.88] + - [830, 8701.88] - - [1024, 256, 1, 4608] - - [821, 6267.95] + - [824, 6267.95] - - [4096, 512, 1, 4608] - - [729, 9316.37] + - [732, 9316.37] - - [2048, 512, 1, 2048] - - [820, 8462.66] + - [823, 8462.66] - - [4096, 512, 1, 1664] - - [729, 9074.43] + - [732, 9074.43] - - [4096, 256, 1, 4608] - - [816, 8717.95] + - [819, 8717.95] - - [1024, 512, 1, 32] - - [809, 1807.89] + - [812, 1807.89] - - [1024, 512, 1, 3840] - - [821, 7936.24] + - [824, 7936.24] - - [2048, 512, 1, 1920] - - [837, 8548.17] + - [840, 8548.17] - - [2048, 1024, 1, 896] - - [729, 8843.41] + - [732, 8843.41] - - [4096, 200, 1, 6144] - - [837, 6864.66] + - [840, 6864.66] - - [1024, 512, 1, 13312] - - [790, 7763.09] + - [793, 7763.09] - - [4096, 1024, 1, 4160] - - [729, 9650.62] + - [732, 9650.62] - - [2048, 200, 1, 2816] - - [821, 6119.66] + - [824, 6119.66] - - [1024, 1024, 1, 3840] - - [816, 8709.4] + - [819, 8709.4] - - [128, 1024, 1, 1024] - - [839, 2577.15] + - [842, 2577.15] - - [2048, 1024, 1, 11264] - - [734, 9338.96] + - [737, 9338.96] - - [2048, 1024, 1, 384] - - [823, 8210.71] + - [826, 8210.71] - - [1024, 256, 1, 2048] - - [844, 5755.48] + - [847, 5755.48] - - [2048, 1024, 1, 3840] - - [731, 9288.86] + - [734, 9288.86] - - [4096, 256, 1, 8320] - - [837, 8812.28] + - [840, 8812.28] - - [2048, 256, 1, 3840] - - [813, 7856.95] + - [816, 7856.95] - - [64, 256, 1, 512] - - [811, 336.082] + - [814, 336.082] - - [4096, 512, 1, 1280] - - [731, 8993.42] + - [734, 8993.42] - - [512, 256, 1, 1280] - - [763, 2995.93] + - [766, 2995.93] - - [1024, 512, 1, 7680] - - [821, 8041.49] + - [824, 8041.49] - - [4096, 1024, 1, 1152] - - [729, 9368.38] + - [732, 9368.38] - - [256, 200, 1, 512] - - [763, 992.97] + - [766, 992.97] - - [256, 1024, 1, 2048] - - [840, 4759.49] + - [843, 4759.49] - - [2048, 200, 1, 10240] - - [832, 6328.93] + - [835, 6328.93] - - [2048, 512, 1, 5120] - - [833, 8732.46] + - [836, 8732.46] - - [2048, 1024, 1, 1408] - - [731, 9006.8] + - [734, 9006.8] - - [512, 1024, 1, 512] - - [821, 6528.1] + - [824, 6528.1] - - [1024, 200, 1, 11264] - - [788, 5194.72] + - [791, 5194.72] - - [512, 1024, 1, 1024] - - [774, 6337.0] + - [777, 6337.0] - - [2048, 512, 1, 32] - - [740, 2777.68] + - [743, 2777.68] - - [4096, 256, 1, 2560] - - [823, 8621.39] + - [826, 8621.39] - - [4096, 256, 1, 64] - - [757, 4194.3] + - [760, 4194.3] - - [32, 1024, 1, 1024] - - [758, 778.164] + - [761, 778.164] - - [2048, 200, 1, 768] - - [832, 5507.23] + - [835, 5507.23] - - [512, 512, 1, 2048] - - [780, 5338.81] + - [783, 5338.81] - - [2048, 512, 1, 2560] - - [834, 8643.59] + - [837, 8643.59] - - [512, 256, 1, 512] - - [813, 2542.0] + - [816, 2542.0] - - [1024, 200, 1, 7680] - - [788, 5047.7] + - [791, 5047.7] - - [4096, 512, 1, 896] - - [729, 8856.75] + - [732, 8856.75] - - [4096, 1024, 1, 3072] - - [729, 9492.07] + - [732, 9492.07] - - [4096, 200, 1, 13312] - - [721, 6900.63] + - [724, 6900.63] - - [2048, 512, 1, 7168] - - [822, 8788.0] + - [825, 8788.0] - - [2048, 1024, 1, 2816] - - [734, 9229.78] + - [737, 9229.78] - - [2048, 512, 1, 128] - - [752, 5629.94] + - [755, 5629.94] - - [1024, 256, 1, 8192] - - [844, 6203.73] + - [847, 6203.73] - - [4096, 1024, 1, 1792] - - [729, 9510.32] + - [732, 9510.32] - - [1024, 200, 1, 6656] - - [813, 5002.75] + - [816, 5002.75] - - [1024, 1024, 1, 1024] - - [750, 8095.16] + - [753, 8095.16] - - [4096, 200, 1, 2304] - - [834, 6754.35] + - [837, 6754.35] - - [4096, 512, 1, 1152] - - [729, 8974.44] + - [732, 8974.44] - - [512, 200, 1, 1024] - - [811, 2232.91] + - [814, 2232.91] - - [1024, 256, 1, 3840] - - [832, 6244.62] + - [835, 6244.62] - - [512, 512, 1, 768] - - [821, 5331.74] + - [824, 5331.74] - - [2048, 512, 1, 4096] - - [831, 8621.66] + - [834, 8621.66] - - [2048, 256, 1, 2560] - - [821, 7770.83] + - [824, 7770.83] - - [2048, 256, 1, 4160] - - [832, 7922.98] + - [835, 7922.98] - - [1024, 256, 1, 64] - - [728, 1705.0] + - [731, 1705.0] - - [4096, 512, 1, 7680] - - [729, 9364.47] + - [732, 9364.47] - - [1024, 512, 1, 1664] - - [832, 7594.14] + - [835, 7594.14] - - [2048, 512, 1, 2080] - - [823, 8570.57] + - [826, 8570.57] - - [2048, 512, 1, 3840] - - [834, 8729.04] + - [837, 8729.04] - - [4096, 1024, 1, 384] - - [729, 8764.76] + - [732, 8764.76] - - [4096, 200, 1, 3072] - - [823, 6772.29] + - [826, 6772.29] - - [1024, 512, 1, 14336] - - [791, 7680.87] + - [794, 7680.87] - - [1024, 200, 1, 1920] - - [813, 4636.98] + - [816, 4636.98] - - [1024, 1024, 1, 1664] - - [827, 8506.39] + - [830, 8506.39] - - [512, 1024, 1, 2304] - - [821, 7775.23] + - [824, 7775.23] - - [2048, 1024, 1, 1792] - - [729, 9123.36] + - [732, 9123.36] - - [32, 200, 1, 512] - - [829, 125.644] + - [832, 125.644] - - [4096, 256, 1, 11264] - - [834, 8822.21] + - [837, 8822.21] - - [4096, 256, 1, 1408] - - [834, 8419.22] + - [837, 8419.22] - - [1024, 256, 1, 7168] - - [821, 6377.44] + - [824, 6377.44] - - [2048, 256, 1, 1152] - - [832, 7401.71] + - [835, 7401.71] - - [256, 256, 1, 512] - - [811, 1314.83] + - [814, 1314.83] - - [1024, 512, 1, 1280] - - [821, 7410.43] + - [824, 7410.43] - - [512, 512, 1, 1792] - - [813, 5931.34] + - [816, 5931.34] - - [2048, 200, 1, 12288] - - [795, 6242.15] + - [798, 6242.15] - - [2048, 200, 1, 1664] - - [832, 5953.65] + - [835, 5953.65] - - [4096, 200, 1, 4608] - - [827, 6853.44] + - [830, 6853.44] - - [512, 1024, 1, 2560] - - [821, 7778.03] + - [824, 7778.03] - - [4096, 200, 1, 384] - - [814, 5765.63] + - [817, 5765.63] - - [128, 512, 1, 512] - - [811, 1302.58] + - [814, 1302.58] - - [1024, 200, 1, 256] - - [815, 2861.83] + - [818, 2861.83] - - [256, 1024, 1, 1024] - - [756, 4522.16] + - [759, 4522.16] - - [2048, 200, 1, 128] - - [821, 3309.9] + - [824, 3309.9] - - [2048, 200, 1, 11264] - - [802, 6168.1] + - [805, 6168.1] - - [1024, 512, 1, 1920] - - [832, 7649.19] + - [835, 7649.19] - - [4096, 256, 1, 1536] - - [827, 8427.23] + - [830, 8427.23] - - [4096, 1024, 1, 3584] - - [729, 9617.9] + - [732, 9617.9] - - [2048, 256, 1, 256] - - [821, 5464.89] + - [824, 5464.89] - - [2048, 1024, 1, 768] - - [729, 8726.77] + - [732, 8726.77] - - [4096, 256, 1, 10240] - - [823, 8790.79] + - [826, 8790.79] - - [2048, 256, 1, 10240] - - [803, 7665.21] + - [806, 7665.21] - - [4096, 200, 1, 14336] - - [837, 6916.08] + - [840, 6916.08] - - [1024, 512, 1, 5120] - - [775, 7420.26] + - [778, 7420.26] - - [1024, 512, 1, 8320] - - [832, 8061.21] + - [835, 8061.21] - - [256, 200, 1, 2048] - - [787, 1916.26] + - [790, 1916.26] - - [1024, 200, 1, 640] - - [815, 3873.29] + - [818, 3873.29] - - [1024, 512, 1, 10240] - - [820, 7526.8] + - [823, 7526.8] - - [1024, 200, 1, 4160] - - [832, 4928.09] + - [835, 4928.09] - - [1024, 200, 1, 5632] - - [813, 4978.56] + - [816, 4978.56] - - [1024, 1024, 1, 2048] - - [768, 7937.18] + - [771, 7937.18] - - [1024, 256, 1, 6656] - - [832, 6373.58] + - [835, 6373.58] - - [2048, 1024, 1, 8320] - - [729, 9333.05] + - [732, 9333.05] - - [1024, 256, 1, 10240] - - [821, 6407.19] + - [824, 6407.19] - - [2048, 256, 1, 2080] - - [821, 7714.48] + - [824, 7714.48] - - [4096, 256, 1, 128] - - [735, 5765.37] + - [738, 5765.37] - - [1024, 256, 1, 768] - - [826, 5210.32] + - [829, 5210.32] - - [2048, 256, 1, 896] - - [832, 7267.36] + - [835, 7267.36] - - [64, 512, 1, 2048] - - [798, 1296.54] + - [801, 1296.54] - - [4096, 512, 1, 2048] - - [731, 9121.15] + - [734, 9121.15] - - [512, 256, 1, 2048] - - [784, 3283.21] + - [787, 3283.21] - - [4096, 256, 1, 16640] - - [816, 8839.78] + - [819, 8839.78] - - [4096, 512, 1, 2560] - - [734, 9222.05] + - [737, 9222.05] - - [1024, 512, 1, 15360] - - [785, 7865.56] + - [788, 7865.56] - - [4096, 1024, 1, 2304] - - [729, 9558.16] + - [732, 9558.16] - - [4096, 200, 1, 1152] - - [834, 6531.83] + - [837, 6531.83] - - [2048, 200, 1, 6144] - - [832, 6277.65] + - [835, 6277.65] - - [1024, 1024, 1, 7680] - - [827, 8799.24] + - [830, 8799.24] - - [2048, 200, 1, 1920] - - [832, 6030.92] + - [835, 6030.92] - - [32, 1024, 1, 2048] - - [806, 1174.88] + - [809, 1174.88] - - [1024, 200, 1, 3584] - - [813, 4880.34] + - [816, 4880.34] - - [4096, 256, 1, 2080] - - [820, 8557.12] + - [823, 8557.12] - - [1024, 1024, 1, 16384] - - [721, 8618.55] + - [724, 8618.55] - - [1024, 256, 1, 1408] - - [832, 5803.44] + - [835, 5803.44] - - [1024, 256, 1, 4096] - - [842, 6037.68] + - [845, 6037.68] - - [2048, 200, 1, 14336] - - [832, 6364.38] + - [835, 6364.38] - - [4096, 512, 1, 5120] - - [731, 9301.95] + - [734, 9301.95] - - [1024, 512, 1, 6144] - - [767, 7468.99] + - [770, 7468.99] - - [1024, 512, 1, 2304] - - [832, 7759.25] + - [835, 7759.25] - - [4096, 200, 1, 4160] - - [816, 6843.12] + - [819, 6843.12] - - [4096, 200, 1, 1536] - - [827, 6628.17] + - [830, 6628.17] - - [4096, 1024, 1, 6144] - - [729, 9592.98] + - [732, 9592.98] - - [256, 64, 1, 1225] - - [861, 1194.67] + - [864, 1194.67] - - [2048, 320, 1, 64] - - [863, 3449.26] + - [866, 3449.26] - - [1024, 128, 1, 289] - - [867, 2869.68] + - [870, 2869.68] - - [384, 64, 1, 1225] - - [852, 1511.33] + - [855, 1511.33] - - [2048, 384, 1, 64] - - [865, 3836.25] + - [868, 3836.25] - - [64, 80, 1, 5329] - - [864, 888.167] + - [867, 888.167] - - [1024, 384, 1, 289] - - [858, 4291.52] + - [861, 4291.52] - - [2048, 448, 1, 64] - - [857, 3783.52] + - [860, 3783.52] - - [768, 192, 1, 289] - - [862, 2690.33] + - [865, 2690.33] - - [288, 64, 1, 1225] - - [851, 1142.67] + - [854, 1142.67] - - [384, 96, 1, 1225] - - [869, 1844.71] + - [872, 1844.71] - - [1024, 3392, 1, 4096] - - [895, 8502.92] + - [898, 8502.92] - - [1024, 3301, 1, 4096] - - [897, 8414.0] + - [900, 8414.0] - - [1024, 3443, 1, 4096] - - [884, 8536.49] + - [887, 8536.49] - - [132, 134, 480, 64] - - [922, 4149.17] + - [925, 4149.17] - - [162, 162, 400, 64] - - [910, 5539.63] + - [913, 5539.63] - - [4096, 3548, 1, 1024] - - [876, 9772.91] + - [879, 9772.91] - - [4096, 2977, 1, 1024] - - [877, 9574.33] + - [880, 9574.33] - - [132, 135, 480, 64] - - [922, 4167.41] + - [925, 4167.41] - - [1024, 2985, 1, 4096] - - [880, 9133.89] + - [883, 9133.89] - - [33708, 3681, 1, 1024] - - [877, 10033.7] + - [880, 10033.7] - - [4096, 3443, 1, 1024] - - [877, 9513.68] + - [880, 9513.68] - - [11, 11, 5456, 64] - - [919, 627.246] + - [922, 627.246] - - [1024, 3400, 1, 4096] - - [898, 8419.92] + - [901, 8419.92] - - [4096, 3995, 1, 1024] - - [876, 9693.77] + - [879, 9693.77] - - [4096, 3190, 1, 1024] - - [876, 9474.74] + - [879, 9474.74] - - [4096, 3594, 1, 1024] - - [877, 9315.73] + - [880, 9315.73] - - [159, 162, 400, 64] - - [909, 5429.88] + - [912, 5429.88] - - [1024, 3565, 1, 4096] - - [892, 8532.7] + - [895, 8532.7] - - [4096, 3422, 1, 1024] - - [877, 9459.14] + - [880, 9459.14] - - [1024, 3214, 1, 4096] - - [897, 8064.82] + - [900, 8064.82] - - [33708, 3584, 1, 1024] - - [878, 10128.9] + - [881, 10128.9] - - [33708, 3640, 1, 1024] - - [875, 9919.12] + - [878, 9919.12] - - [4096, 3263, 1, 1024] - - [875, 9699.25] + - [878, 9699.25] - - [4096, 3296, 1, 1024] - - [875, 9780.7] + - [878, 9780.7] - - [1024, 3557, 1, 4096] - - [896, 8526.79] + - [899, 8526.79] - - [4096, 3463, 1, 1024] - - [875, 9578.03] + - [878, 9578.03] - - [4096, 3528, 1, 1024] - - [875, 9739.82] + - [878, 9739.82] - - [14, 14, 4368, 64] - - [907, 991.176] + - [910, 991.176] - - [4096, 3226, 1, 1024] - - [875, 9587.09] + - [878, 9587.09] - - [4096, 3439, 1, 1024] - - [878, 9499.62] + - [881, 9499.62] - - [1024, 3523, 1, 4096] - - [898, 8393.48] + - [901, 8393.48] - - [1024, 3098, 1, 4096] - - [904, 7882.77] + - [907, 7882.77] - - [4096, 3121, 1, 1024] - - [875, 9296.13] + - [878, 9296.13] - - [33708, 3894, 1, 1024] - - [876, 9952.17] + - [879, 9952.17] - - [1024, 3548, 1, 4096] - - [882, 8432.35] + - [885, 8432.35] - - [1024, 3451, 1, 4096] - - [895, 8456.34] + - [898, 8456.34] - - [4096, 3353, 1, 1024] - - [877, 9288.98] + - [880, 9288.98] - - [4096, 3402, 1, 1024] - - [877, 9406.34] + - [880, 9406.34] - - [4096, 3939, 1, 1024] - - [875, 9549.49] + - [878, 9549.49] - - [133, 133, 480, 64] - - [922, 4124.21] + - [925, 4124.21] - - [1024, 3559, 1, 4096] - - [897, 8586.94] + - [900, 8586.94] - - [1024, 2977, 1, 4096] - - [880, 9084.49] + - [883, 9084.49] - - [1024, 3478, 1, 4096] - - [891, 8342.75] + - [894, 8342.75] - - [134, 134, 480, 64] - - [924, 4204.33] + - [927, 4204.33] - - [1024, 3368, 1, 4096] - - [897, 8277.33] + - [900, 8277.33] - - [4096, 4012, 1, 1024] - - [877, 9726.47] + - [880, 9726.47] - - [4096, 3486, 1, 1024] - - [875, 9639.61] + - [878, 9639.61] - - [1024, 3479, 1, 4096] - - [885, 8420.27] + - [888, 8420.27] - - [1024, 3505, 1, 4096] - - [897, 8310.56] + - [900, 8310.56] - - [4096, 3381, 1, 1024] - - [878, 9357.65] + - [881, 9357.65] - - [4096, 3430, 1, 1024] - - [875, 9482.26] + - [878, 9482.26] - - [1024, 3554, 1, 4096] - - [897, 8592.28] + - [900, 8592.28] - - [4096, 3271, 1, 1024] - - [875, 9715.31] + - [878, 9715.31] - - [1024, 3063, 1, 4096] - - [879, 9388.46] + - [882, 9388.46] - - [1024, 3209, 1, 4096] - - [897, 8212.64] + - [900, 8212.64] - - [4096, 3503, 1, 1024] - - [877, 9680.49] + - [880, 9680.49] - - [4096, 3344, 1, 1024] - - [875, 9268.45] + - [878, 9268.45] - - [1024, 3147, 1, 4096] - - [898, 8037.1] + - [901, 8037.1] - - [1024, 3322, 1, 4096] - - [896, 8356.22] + - [899, 8356.22] - - [1024, 3341, 1, 4096] - - [897, 8316.23] + - [900, 8316.23] - - [1024, 3516, 1, 4096] - - [879, 8397.02] + - [882, 8397.02] - - [102, 101, 624, 64] - - [910, 4709.49] + - [913, 4709.49] - - [1024, 3454, 1, 4096] - - [896, 8425.5] + - [899, 8425.5] - - [4096, 3969, 1, 1024] - - [877, 9640.05] + - [880, 9640.05] - - [4096, 3466, 1, 1024] - - [877, 9576.73] + - [880, 9576.73] - - [1024, 3999, 1, 1024] - - [880, 9207.05] + - [883, 9207.05] - - [1024, 4032, 1, 1024] - - [881, 9294.46] + - [884, 9294.46] - - [1024, 3403, 1, 4096] - - [895, 8357.87] + - [898, 8357.87] - - [4096, 3361, 1, 1024] - - [877, 9308.68] + - [880, 9308.68] - - [1024, 3527, 1, 4096] - - [896, 8512.09] + - [899, 8512.09] - - [1024, 3822, 1, 4096] - - [880, 8991.03] + - [883, 8991.03] - - [4096, 3315, 1, 1024] - - [875, 9834.86] + - [878, 9834.86] - - [232, 232, 272, 64] - - [909, 6481.52] + - [912, 6481.52] - - [1024, 3336, 1, 4096] - - [898, 8295.51] + - [901, 8295.51] - - [228, 232, 272, 64] - - [910, 6327.75] + - [913, 6327.75] - - [4096, 3547, 1, 1024] - - [875, 9781.46] + - [878, 9781.46] - - [4096, 3340, 1, 1024] - - [877, 9269.62] + - [880, 9269.62] - - [1024, 3906, 1, 1024] - - [881, 9018.28] + - [884, 9018.28] - - [1024, 3295, 1, 4096] - - [895, 8194.73] + - [898, 8194.73] - - [4096, 3294, 1, 1024] - - [878, 9762.06] + - [881, 9762.06] - - [33708, 3968, 1, 1024] - - [878, 10147.7] + - [881, 10147.7] - - [1024, 3473, 1, 4096] - - [884, 8318.58] + - [887, 8318.58] - - [1024, 3072, 1, 4096] - - [881, 9370.03] + - [884, 9370.03] - - [4096, 3189, 1, 1024] - - [875, 9470.16] + - [878, 9470.16] - - [4096, 3494, 1, 1024] - - [875, 9661.22] + - [878, 9661.22] - - [1024, 3522, 1, 4096] - - [898, 8459.13] + - [901, 8459.13] - - [33708, 3944, 1, 1024] - - [878, 10060.1] + - [881, 10060.1] - - [135, 135, 480, 64] - - [923, 4256.93] + - [926, 4256.93] - - [4096, 3421, 1, 1024] - - [875, 9456.88] + - [878, 9456.88] - - [32, 32, 1984, 64] - - [920, 3436.14] + - [923, 3436.14] - - [4096, 3311, 1, 1024] - - [875, 9810.78] + - [878, 9810.78] - - [1024, 3990, 1, 1024] - - [882, 9197.64] + - [885, 9197.64] - - [1024, 3290, 1, 4096] - - [895, 8229.53] + - [898, 8229.53] - - [4096, 3565, 1, 1024] - - [876, 9824.38] + - [879, 9824.38] - - [1024, 3484, 1, 4096] - - [885, 8575.28] + - [888, 8575.28] - - [4096, 3384, 1, 1024] - - [875, 9366.44] + - [878, 9366.44] - - [1024, 3422, 1, 4096] - - [895, 8484.02] + - [898, 8484.02] - - [4096, 3681, 1, 1024] - - [876, 9520.06] + - [879, 9520.06] - - [1024, 3584, 1, 1024] - - [902, 8583.27] + - [905, 8583.27] - - [4096, 4050, 1, 1024] - - [877, 9807.25] + - [880, 9807.25] - - [1024, 3996, 1, 4096] - - [878, 9181.6] + - [881, 9181.6] - - [4096, 3169, 1, 1024] - - [876, 9411.3] + - [879, 9411.3] - - [4096, 3538, 1, 1024] - - [876, 9765.89] + - [879, 9765.89] - - [1024, 3495, 1, 4096] - - [882, 8295.85] + - [885, 8295.85] - - [4096, 3401, 1, 1024] - - [875, 9402.58] + - [878, 9402.58] - - [1024, 3560, 1, 4096] - - [896, 8513.35] + - [899, 8513.35] - - [133, 135, 480, 64] - - [923, 4198.98] + - [926, 4198.98] - - [1024, 3263, 1, 4096] - - [897, 8172.13] + - [900, 8172.13] - - [1024, 3870, 1, 4096] - - [877, 8996.17] + - [880, 8996.17] - - [4096, 3555, 1, 1024] - - [878, 9811.78] + - [881, 9811.78] - - [4096, 3412, 1, 1024] - - [875, 9431.99] + - [878, 9431.99] - - [101, 101, 624, 64] - - [909, 4667.59] + - [912, 4667.59] - - [1024, 3296, 1, 4096] - - [896, 8350.51] + - [899, 8350.51] - - [1024, 3379, 1, 4096] - - [898, 8432.84] + - [901, 8432.84] - - [4096, 3302, 1, 1024] - - [875, 9796.29] + - [878, 9796.29] - - [1024, 3490, 1, 4096] - - [895, 8538.34] + - [898, 8538.34] - - [1024, 3428, 1, 4096] - - [896, 8531.57] + - [899, 8531.57] - - [1024, 3976, 1, 4096] - - [877, 9327.77] + - [880, 9327.77] - - [4096, 3485, 1, 1024] - - [875, 9628.72] + - [878, 9628.72] - - [4096, 3534, 1, 1024] - - [875, 9755.87] + - [878, 9755.87] - - [1024, 3064, 1, 4096] - - [881, 9196.88] + - [884, 9196.88] - - [4096, 3216, 1, 1024] - - [877, 9563.34] + - [880, 9563.34] - - [1024, 3450, 1, 4096] - - [905, 8519.19] + - [908, 8519.19] - - [1024, 3533, 1, 4096] - - [896, 8495.67] + - [899, 8495.67] - - [1024, 4030, 1, 1024] - - [881, 9304.58] + - [884, 9304.58] - - [1024, 3311, 1, 4096] - - [896, 8278.5] + - [899, 8278.5] - - [1024, 3468, 1, 4096] - - [887, 8564.45] + - [890, 8564.45] - - [23, 23, 2720, 64] - - [911, 2311.45] + - [914, 2311.45] - - [4096, 3359, 1, 1024] - - [877, 9309.05] + - [880, 9309.05] - - [4096, 3392, 1, 1024] - - [877, 9388.09] + - [880, 9388.09] - - [1024, 3925, 1, 1024] - - [879, 9006.62] + - [882, 9006.62] - - [4096, 3233, 1, 1024] - - [875, 9603.54] + - [878, 9603.54] - - [4096, 3956, 1, 1024] - - [876, 9581.84] + - [879, 9581.84] - - [1024, 3463, 1, 4096] - - [897, 8293.87] + - [900, 8293.87] - - [1024, 3126, 1, 4096] - - [896, 7978.03] + - [899, 7978.03] - - [1024, 3363, 1, 4096] - - [889, 8267.37] + - [892, 8267.37] - - [4096, 3465, 1, 1024] - - [875, 9590.64] + - [878, 9590.64] - - [33708, 3996, 1, 1024] - - [876, 9899.89] + - [879, 9899.89] - - [1024, 3231, 1, 4096] - - [897, 8231.58] + - [900, 8231.58] - - [33708, 3978, 1, 1024] - - [876, 9853.54] + - [879, 9853.54] - - [4096, 3476, 1, 1024] - - [875, 9616.52] + - [878, 9616.52] - - [85, 85, 752, 64] - - [907, 4240.55] + - [910, 4240.55] - - [4096, 3339, 1, 1024] - - [877, 9249.71] + - [880, 9249.71] - - [4096, 3452, 1, 1024] - - [875, 9534.03] + - [878, 9534.03] - - [1024, 3396, 1, 4096] - - [896, 8451.13] + - [899, 8451.13] - - [4096, 3293, 1, 1024] - - [877, 9775.12] + - [880, 9775.12] - - [54, 54, 1184, 64] - - [909, 4153.44] + - [912, 4153.44] - - [1024, 3432, 1, 4096] - - [890, 8345.43] + - [893, 8345.43] - - [4096, 3493, 1, 1024] - - [878, 9649.8] + - [881, 9649.8] - - [4096, 3350, 1, 1024] - - [877, 9273.81] + - [880, 9273.81] - - [1024, 3079, 1, 4096] - - [905, 7775.56] + - [908, 7775.56] - - [1024, 3101, 1, 4096] - - [905, 7847.75] + - [908, 7847.75] - - [33708, 3939, 1, 1024] - - [878, 10054.3] + - [881, 10054.3] - - [4096, 3256, 1, 1024] - - [877, 9681.73] + - [880, 9681.73] - - [1024, 3439, 1, 4096] - - [896, 8531.01] + - [899, 8531.01] - - [1024, 3510, 1, 4096] - - [895, 8422.21] + - [898, 8422.21] - - [4096, 3900, 1, 1024] - - [876, 9468.51] + - [879, 9468.51] - - [1024, 3470, 1, 4096] - - [897, 8507.67] + - [900, 8507.67] - - [4096, 3456, 1, 1024] - - [877, 9577.36] + - [880, 9577.36] - - [4096, 3014, 1, 1024] - - [876, 9666.05] + - [879, 9666.05] - - [4096, 3367, 1, 1024] - - [878, 9328.26] + - [881, 9328.26] - - [4096, 3432, 1, 1024] - - [875, 9480.78] + - [878, 9480.78] - - [33708, 4026, 1, 1024] - - [878, 9972.73] + - [881, 9972.73] - - [4096, 3273, 1, 1024] - - [875, 9716.85] + - [878, 9716.85] - - [4096, 3130, 1, 1024] - - [875, 9311.3] + - [878, 9311.3] - - [1024, 3496, 1, 4096] - - [886, 8434.55] + - [889, 8434.55] - - [1024, 3995, 1, 4096] - - [871, 9157.63] + - [874, 9157.63] - - [1024, 3939, 1, 4096] - - [879, 9059.76] + - [882, 9059.76] - - [1024, 3121, 1, 4096] - - [903, 7963.33] + - [906, 7963.33] - - [1024, 3232, 1, 4096] - - [897, 8060.99] + - [900, 8060.99] - - [4096, 3147, 1, 1024] - - [877, 9364.53] + - [880, 9364.53] - - [4096, 3516, 1, 1024] - - [875, 9708.74] + - [878, 9708.74] - - [1024, 3969, 1, 1024] - - [881, 9168.58] + - [884, 9168.58] - - [1024, 3364, 1, 4096] - - [885, 8363.55] + - [888, 8363.55] - - [4096, 3411, 1, 1024] - - [878, 9442.67] + - [881, 9442.67] - - [147, 147, 432, 64] - - [922, 4843.11] + - [925, 4843.11] - - [4096, 3301, 1, 1024] - - [877, 9783.36] + - [880, 9783.36] - - [112, 111, 576, 64] - - [909, 5627.37] + - [912, 5627.37] - - [1024, 3513, 1, 4096] - - [896, 8725.31] + - [899, 8725.31] - - [1024, 3469, 1, 4096] - - [876, 8183.01] + - [879, 8183.01] - - [1024, 3095, 1, 4096] - - [897, 7887.77] + - [900, 7887.77] - - [4096, 3533, 1, 1024] - - [876, 9755.17] + - [879, 9755.17] - - [4096, 3390, 1, 1024] - - [875, 9377.11] + - [878, 9377.11] - - [4096, 3582, 1, 1024] - - [875, 9874.86] + - [878, 9874.86] - - [1024, 3956, 1, 1024] - - [881, 9058.72] + - [884, 9058.72] - - [4096, 3585, 1, 1024] - - [877, 9289.65] + - [880, 9289.65] - - [4096, 3231, 1, 1024] - - [876, 9597.05] + - [879, 9597.05] - - [1024, 3205, 1, 4096] - - [895, 8073.15] + - [898, 8073.15] - - [4096, 3496, 1, 1024] - - [876, 9668.28] + - [879, 9668.28] - - [1024, 3143, 1, 4096] - - [895, 8031.58] + - [898, 8031.58] - - [1024, 3318, 1, 4096] - - [892, 8261.33] + - [895, 8261.33] - - [1024, 3353, 1, 4096] - - [896, 8414.82] + - [899, 8414.82] - - [1024, 3464, 1, 4096] - - [895, 8309.93] + - [898, 8309.93] - - [4096, 2736, 1, 1024] - - [877, 9563.02] + - [880, 9563.02] - - [1024, 3402, 1, 4096] - - [892, 8413.74] + - [895, 8413.74] - - [4096, 3138, 1, 1024] - - [877, 9341.99] + - [880, 9341.99] - - [1024, 3860, 1, 4096] - - [880, 9008.47] + - [883, 9008.47] - - [148, 148, 432, 64] - - [922, 4915.6] + - [925, 4915.6] - - [1024, 3539, 1, 4096] - - [892, 8449.26] + - [895, 8449.26] - - [4096, 3211, 1, 1024] - - [877, 9551.18] + - [880, 9551.18] - - [1024, 3332, 1, 4096] - - [885, 8295.01] + - [888, 8295.01] - - [1024, 3466, 1, 4096] - - [896, 8339.15] + - [899, 8339.15] - - [4096, 3475, 1, 1024] - - [875, 9612.23] + - [878, 9612.23] - - [4096, 3524, 1, 1024] - - [878, 9722.64] + - [881, 9722.64] - - [4096, 2985, 1, 1024] - - [878, 9591.23] + - [881, 9591.23] - - [4096, 3222, 1, 1024] - - [875, 9577.38] + - [878, 9577.38] - - [4096, 3451, 1, 1024] - - [877, 9541.32] + - [880, 9541.32] - - [1024, 3181, 1, 4096] - - [895, 8118.79] + - [898, 8118.79] - - [1024, 3640, 1, 4096] - - [880, 8617.01] + - [883, 8617.01] - - [1024, 3375, 1, 4096] - - [884, 8419.65] + - [887, 8419.65] - - [1024, 3550, 1, 4096] - - [897, 8512.73] + - [900, 8512.73] - - [1024, 4020, 1, 1024] - - [881, 9266.8] + - [884, 9266.8] - - [1024, 3840, 1, 4096] - - [880, 8983.39] + - [883, 8983.39] - - [4096, 3349, 1, 1024] - - [875, 9279.86] + - [878, 9279.86] - - [4096, 3398, 1, 1024] - - [876, 9402.22] + - [879, 9402.22] - - [33708, 3976, 1, 1024] - - [877, 9849.44] + - [880, 9849.44] - - [1024, 2917, 1, 4096] - - [882, 8936.77] + - [885, 8936.77] - - [33708, 3910, 1, 1024] - - [875, 9983.25] + - [878, 9983.25] - - [4096, 3860, 1, 1024] - - [876, 9377.48] + - [879, 9377.48] - - [4096, 3304, 1, 1024] - - [878, 9798.34] + - [881, 9798.34] - - [1024, 3286, 1, 4096] - - [883, 8167.31] + - [886, 8167.31] - - [1024, 3460, 1, 4096] - - [893, 8539.46] + - [896, 8539.46] - - [1024, 4026, 1, 4096] - - [879, 9305.58] + - [882, 9305.58] - - [4096, 3471, 1, 1024] - - [877, 9596.61] + - [880, 9596.61] - - [193, 193, 320, 64] - - [925, 4758.36] + - [928, 4758.36] - - [1024, 3894, 1, 1024] - - [879, 8979.5] + - [882, 8979.5] - - [65, 65, 992, 64] - - [921, 2565.39] + - [924, 2565.39] - - [1024, 3506, 1, 4096] - - [893, 8593.12] + - [896, 8593.12] - - [35, 35, 1808, 64] - - [915, 2129.62] + - [918, 2129.62] - - [1024, 4000, 1, 1024] - - [879, 9204.5] + - [882, 9204.5] - - [1024, 3900, 1, 4096] - - [875, 9050.26] + - [878, 9050.26] - - [1024, 3445, 1, 4096] - - [898, 8551.55] + - [901, 8551.55] - - [4096, 3442, 1, 1024] - - [876, 9504.9] + - [879, 9504.9] - - [1024, 3358, 1, 4096] - - [897, 8437.06] + - [900, 8437.06] - - [13, 13, 4672, 64] - - [908, 860.565] + - [911, 860.565] - - [1024, 3211, 1, 4096] - - [901, 8085.15] + - [904, 8085.15] - - [4096, 3515, 1, 1024] - - [877, 9715.19] + - [880, 9715.19] - - [1024, 3564, 1, 4096] - - [883, 8760.27] + - [886, 8760.27] - - [4096, 3057, 1, 1024] - - [877, 9803.95] + - [880, 9803.95] - - [1024, 3343, 1, 4096] - - [895, 8363.7] + - [898, 8363.7] - - [4096, 3262, 1, 1024] - - [876, 9686.39] + - [879, 9686.39] - - [1024, 3518, 1, 4096] - - [895, 8454.95] + - [898, 8454.95] - - [77, 77, 816, 64] - - [914, 3505.84] + - [917, 3505.84] - - [33708, 3876, 1, 1024] - - [876, 9895.85] + - [879, 9895.85] - - [4096, 3462, 1, 1024] - - [877, 9570.21] + - [880, 9570.21] - - [1024, 3265, 1, 4096] - - [895, 8322.65] + - [898, 8322.65] - - [4096, 3389, 1, 1024] - - [876, 9382.76] + - [879, 9382.76] - - [4096, 3438, 1, 1024] - - [877, 9503.37] + - [880, 9503.37] - - [1024, 3955, 1, 1024] - - [879, 9064.35] + - [882, 9064.35] - - [1024, 3545, 1, 4096] - - [898, 8652.31] + - [901, 8652.31] - - [1024, 3144, 1, 4096] - - [898, 8060.45] + - [901, 8060.45] - - [1024, 3417, 1, 4096] - - [896, 8505.81] + - [899, 8505.81] - - [4096, 3543, 1, 1024] - - [875, 9775.57] + - [878, 9775.57] - - [4096, 3352, 1, 1024] - - [877, 9282.77] + - [880, 9282.77] - - [33708, 3975, 1, 1024] - - [878, 9849.39] + - [881, 9849.39] - - [148, 147, 432, 64] - - [922, 4876.05] + - [925, 4876.05] - - [4096, 3137, 1, 1024] - - [875, 9330.53] + - [878, 9330.53] - - [4096, 3506, 1, 1024] - - [878, 9682.66] + - [881, 9682.66] - - [1024, 3975, 1, 1024] - - [881, 9164.67] + - [884, 9164.67] - - [1024, 3859, 1, 4096] - - [879, 8983.74] + - [882, 8983.74] - - [4096, 3369, 1, 1024] - - [877, 9330.35] + - [880, 9330.35] - - [1024, 3434, 1, 4096] - - [895, 8486.88] + - [898, 8486.88] - - [1024, 3292, 1, 4096] - - [895, 8478.86] + - [898, 8478.86] - - [4096, 3523, 1, 1024] - - [875, 9734.73] + - [878, 9734.73] - - [4096, 3380, 1, 1024] - - [877, 9354.39] + - [880, 9354.39] - - [1024, 3408, 1, 4096] - - [898, 8440.93] + - [901, 8440.93] - - [4096, 3221, 1, 1024] - - [877, 9575.49] + - [880, 9575.49] - - [4096, 3270, 1, 1024] - - [877, 9717.85] + - [880, 9717.85] - - [143, 143, 432, 64] - - [923, 4643.35] + - [926, 4643.35] - - [111, 111, 576, 64] - - [915, 5474.94] + - [918, 5474.94] - - [1024, 3303, 1, 4096] - - [897, 8412.97] + - [900, 8412.97] - - [4096, 3502, 1, 1024] - - [877, 9679.77] + - [880, 9679.77] - - [1024, 3222, 1, 4096] - - [897, 8141.78] + - [900, 8141.78] - - [4096, 2505, 1, 1024] - - [875, 9594.85] + - [878, 9594.85] - - [4096, 3397, 1, 1024] - - [875, 9392.51] + - [878, 9392.51] - - [4096, 3562, 1, 1024] - - [875, 9827.48] + - [878, 9827.48] - - [4096, 3095, 1, 1024] - - [877, 9222.35] + - [880, 9222.35] - - [1024, 3226, 1, 4096] - - [893, 8026.93] + - [896, 8026.93] - - [177, 177, 352, 64] - - [910, 6406.86] + - [913, 6406.86] - - [4096, 3360, 1, 1024] - - [876, 9298.05] + - [879, 9298.05] - - [1024, 3942, 1, 1024] - - [881, 9061.49] + - [884, 9061.49] - - [1024, 3298, 1, 4096] - - [898, 8254.26] + - [901, 8254.26] - - [1024, 3381, 1, 4096] - - [897, 8508.71] + - [900, 8508.71] - - [4096, 3314, 1, 1024] - - [877, 9837.46] + - [880, 9837.46] - - [1024, 3492, 1, 4096] - - [885, 8583.29] + - [888, 8583.29] - - [1024, 3430, 1, 4096] - - [885, 8492.61] + - [888, 8492.61] - - [4096, 3977, 1, 1024] - - [877, 9656.35] + - [880, 9656.35] - - [4096, 3546, 1, 1024] - - [875, 9780.25] + - [878, 9780.25] - - [4096, 3640, 1, 1024] - - [875, 9415.41] + - [878, 9415.41] - - [4096, 3441, 1, 1024] - - [876, 9499.14] + - [879, 9499.14] - - [33708, 4059, 1, 1024] - - [878, 10051.8] + - [881, 10051.8] - - [1024, 3978, 1, 1024] - - [879, 9158.7] + - [882, 9158.7] - - [1024, 3376, 1, 4096] - - [897, 8415.34] + - [900, 8415.34] - - [1024, 3482, 1, 4096] - - [898, 8396.52] + - [901, 8396.52] - - [1024, 3563, 1, 4096] - - [881, 8424.08] + - [884, 8424.08] - - [4096, 4020, 1, 1024] - - [878, 9745.86] + - [881, 9745.86] - - [1024, 3271, 1, 4096] - - [896, 8289.58] + - [899, 8289.58] - - [1024, 3291, 1, 4096] - - [896, 8222.61] + - [899, 8222.61] - - [1024, 3431, 1, 4096] - - [891, 8464.3] + - [894, 8464.3] - - [1024, 3481, 1, 4096] - - [897, 8386.4] + - [900, 8386.4] - - [84, 85, 752, 64] - - [912, 4194.75] + - [915, 4194.75] - - [4096, 3461, 1, 1024] - - [875, 9579.57] + - [878, 9579.57] - - [1024, 3574, 1, 4096] - - [898, 8579.7] + - [901, 8579.7] - - [1024, 4059, 1, 1024] - - [879, 9330.44] + - [882, 9330.44] - - [84, 84, 752, 64] - - [919, 4141.36] + - [922, 4141.36] - - [1024, 3421, 1, 4096] - - [898, 8528.32] + - [901, 8528.32] - - [4096, 3224, 1, 1024] - - [877, 9589.85] + - [880, 9589.85] - - [4096, 3437, 1, 1024] - - [877, 9498.1] + - [880, 9498.1] - - [45, 45, 1424, 64] - - [909, 3314.48] + - [912, 3314.48] - - [4096, 3840, 1, 1024] - - [875, 9931.27] + - [878, 9931.27] - - [4096, 3168, 1, 1024] - - [877, 9412.06] + - [880, 9412.06] - - [33708, 3990, 1, 1024] - - [875, 9884.29] + - [878, 9884.29] - - [1024, 3349, 1, 4096] - - [897, 8421.3] + - [900, 8421.3] - - [4096, 3335, 1, 1024] - - [875, 9241.55] + - [878, 9241.55] - - [4096, 3400, 1, 1024] - - [877, 9407.25] + - [880, 9407.25] - - [160, 159, 400, 64] - - [924, 5708.84] + - [927, 5708.84] - - [1024, 3398, 1, 4096] - - [897, 8623.93] + - [900, 8623.93] - - [1024, 3780, 1, 4096] - - [877, 8756.68] + - [880, 8756.68] - - [29, 29, 2176, 64] - - [920, 2963.59] + - [923, 2963.59] - - [4096, 3098, 1, 1024] - - [875, 9229.72] + - [878, 9229.72] - - [1024, 4012, 1, 4096] - - [881, 9421.93] + - [884, 9421.93] - - [4096, 3505, 1, 1024] - - [877, 9687.55] + - [880, 9687.55] - - [4096, 3554, 1, 1024] - - [877, 9812.12] + - [880, 9812.12] - - [4096, 3063, 1, 1024] - - [877, 9825.0] + - [880, 9825.0] - - [1024, 3503, 1, 4096] - - [895, 8404.64] + - [898, 8404.64] - - [1024, 3166, 1, 4096] - - [898, 8084.83] + - [901, 8084.83] - - [1024, 3425, 1, 4096] - - [898, 8537.48] + - [901, 8537.48] - - [1024, 3344, 1, 4096] - - [889, 8351.06] + - [892, 8351.06] - - [4096, 3484, 1, 1024] - - [877, 9635.6] + - [880, 9635.6] - - [1024, 3681, 1, 1024] - - [880, 8457.08] + - [883, 8457.08] - - [1024, 4050, 1, 1024] - - [881, 9326.11] + - [884, 9326.11] - - [4096, 3379, 1, 1024] - - [875, 9356.06] + - [878, 9356.06] - - [4096, 3428, 1, 1024] - - [876, 9472.23] + - [879, 9472.23] - - [12, 12, 5040, 64] - - [914, 741.517] + - [917, 741.517] - - [27, 27, 2336, 64] - - [920, 2757.8] + - [923, 2757.8] - - [1024, 3304, 1, 4096] - - [898, 8317.72] + - [901, 8317.72] - - [1024, 3387, 1, 4096] - - [896, 8460.05] + - [899, 8460.05] - - [4096, 3126, 1, 1024] - - [878, 9308.38] + - [881, 9308.38] - - [1024, 3498, 1, 4096] - - [895, 8485.45] + - [898, 8485.45] - - [1024, 3436, 1, 4096] - - [897, 8397.61] + - [900, 8397.61] - - [4096, 3501, 1, 1024] - - [875, 9681.09] + - [878, 9681.09] - - [4096, 3358, 1, 1024] - - [877, 9304.8] + - [880, 9304.8] - - [4096, 3232, 1, 1024] - - [875, 9607.1] + - [878, 9607.1] - - [1024, 3585, 1, 4096] - - [879, 8510.64] + - [882, 8510.64] - - [4096, 3143, 1, 1024] - - [878, 9355.81] + - [881, 9355.81] - - [4096, 3464, 1, 1024] - - [877, 9585.85] + - [880, 9585.85] - - [1024, 3366, 1, 4096] - - [885, 8275.13] + - [888, 8275.13] - - [4096, 3375, 1, 1024] - - [875, 9342.03] + - [878, 9342.03] - - [4096, 2917, 1, 1024] - - [875, 9372.74] + - [878, 9372.74] - - [4096, 4026, 1, 1024] - - [877, 9759.05] + - [880, 9759.05] - - [49, 49, 1296, 64] - - [916, 3709.92] + - [919, 3709.92] - - [1024, 3277, 1, 4096] - - [896, 8217.0] + - [899, 8217.0] - - [1024, 3103, 1, 4096] - - [897, 7872.57] + - [900, 7872.57] - - [33708, 3995, 1, 1024] - - [877, 9892.98] + - [880, 9892.98] - - [1024, 3297, 1, 4096] - - [896, 8185.72] + - [899, 8185.72] - - [4096, 3545, 1, 1024] - - [877, 9789.33] + - [880, 9789.33] - - [1024, 3399, 1, 4096] - - [896, 8377.08] + - [899, 8377.08] - - [33708, 3796, 1, 1024] - - [876, 10007.9] + - [879, 10007.9] - - [4096, 3292, 1, 1024] - - [877, 9767.18] + - [880, 9767.18] - - [71, 71, 896, 64] - - [911, 3006.15] + - [914, 3006.15] - - [33708, 3859, 1, 1024] - - [878, 9860.27] + - [881, 9860.27] - - [4096, 3566, 1, 1024] - - [877, 9834.37] + - [880, 9834.37] - - [4096, 3894, 1, 1024] - - [875, 9456.57] + - [878, 9456.57] - - [4096, 3492, 1, 1024] - - [875, 9653.14] + - [878, 9653.14] - - [1024, 3977, 1, 1024] - - [881, 9161.23] + - [884, 9161.23] - - [1024, 3272, 1, 4096] - - [898, 8256.99] + - [901, 8256.99] - - [135, 134, 480, 64] - - [922, 4238.29] + - [925, 4238.29] - - [1024, 3355, 1, 4096] - - [896, 8374.54] + - [899, 8374.54] - - [4096, 3419, 1, 1024] - - [878, 9455.34] + - [881, 9455.34] - - [1024, 3404, 1, 4096] - - [897, 8580.18] + - [900, 8580.18] - - [4096, 3999, 1, 1024] - - [877, 9701.68] + - [880, 9701.68] - - [4096, 3166, 1, 1024] - - [875, 9410.38] + - [878, 9410.38] - - [33708, 3840, 1, 1024] - - [878, 10132.8] + - [881, 10132.8] - - [4096, 4032, 1, 1024] - - [878, 9762.76] + - [881, 9762.76] - - [1024, 3573, 1, 4096] - - [896, 8603.3] + - [899, 8603.3] - - [4096, 3366, 1, 1024] - - [878, 9322.53] + - [881, 9322.53] - - [1024, 3541, 1, 4096] - - [898, 8405.8] + - [901, 8405.8] - - [4096, 3207, 1, 1024] - - [875, 9544.15] + - [878, 9544.15] - - [4096, 3272, 1, 1024] - - [877, 9716.63] + - [880, 9716.63] - - [1024, 3334, 1, 4096] - - [895, 8241.29] + - [898, 8241.29] - - [228, 228, 272, 64] - - [910, 6232.35] + - [913, 6232.35] - - [4096, 3183, 1, 1024] - - [877, 9452.34] + - [880, 9452.34] - - [4096, 3536, 1, 1024] - - [876, 9759.34] + - [879, 9759.34] - - [1024, 4005, 1, 1024] - - [880, 9225.73] + - [883, 9225.73] - - [1024, 3245, 1, 4096] - - [897, 8074.21] + - [900, 8074.21] - - [4096, 3447, 1, 1024] - - [876, 9525.74] + - [879, 9525.74] - - [1024, 3183, 1, 4096] - - [896, 8121.52] + - [899, 8121.52] - - [1024, 3361, 1, 4096] - - [898, 8285.76] + - [901, 8285.76] - - [33708, 3870, 1, 1024] - - [876, 9879.25] + - [879, 9879.25] - - [1024, 3321, 1, 4096] - - [897, 8408.57] + - [900, 8408.57] - - [1024, 3968, 1, 1024] - - [879, 9201.95] + - [882, 9201.95] - - [1024, 3486, 1, 4096] - - [893, 8258.79] + - [896, 8258.79] - - [4096, 4005, 1, 1024] - - [877, 9723.88] + - [880, 9723.88] - - [4096, 3410, 1, 1024] - - [878, 9440.4] + - [881, 9440.4] - - [1024, 3944, 1, 1024] - - [881, 9040.72] + - [884, 9040.72] - - [4096, 3300, 1, 1024] - - [876, 9789.8] + - [879, 9789.8] - - [4096, 3579, 1, 1024] - - [878, 9859.34] + - [881, 9859.34] - - [4096, 3483, 1, 1024] - - [878, 9624.21] + - [881, 9624.21] - - [4096, 3532, 1, 1024] - - [877, 9742.66] + - [880, 9742.66] - - [1024, 3140, 1, 4096] - - [897, 7899.55] + - [900, 7899.55] - - [1024, 3372, 1, 4096] - - [895, 8236.97] + - [898, 8236.97] - - [1024, 3224, 1, 4096] - - [898, 8159.03] + - [901, 8159.03] - - [4096, 3230, 1, 1024] - - [877, 9601.15] + - [880, 9601.15] - - [4096, 3427, 1, 1024] - - [877, 9466.47] + - [880, 9466.47] - - [1024, 3796, 1, 1024] - - [881, 8739.68] + - [884, 8739.68] - - [143, 148, 432, 64] - - [922, 4761.9] + - [925, 4761.9] - - [1024, 3616, 1, 4096] - - [880, 8445.79] + - [883, 8445.79] - - [1024, 3315, 1, 4096] - - [897, 8403.11] + - [900, 8403.11] - - [1024, 3476, 1, 4096] - - [895, 8523.58] + - [898, 8523.58] - - [1024, 3509, 1, 4096] - - [895, 8344.95] + - [898, 8344.95] - - [4096, 3357, 1, 1024] - - [877, 9300.06] + - [880, 9300.06] - - [4096, 3406, 1, 1024] - - [877, 9427.34] + - [880, 9427.34] - - [1024, 3558, 1, 4096] - - [896, 8525.68] + - [899, 8525.68] - - [4096, 3593, 1, 1024] - - [877, 9302.1] + - [880, 9302.1] - - [4096, 3247, 1, 1024] - - [877, 9648.4] + - [880, 9648.4] - - [4096, 3088, 1, 1024] - - [877, 9204.11] + - [880, 9204.11] - - [1024, 3213, 1, 4096] - - [895, 8054.21] + - [898, 8054.21] - - [4096, 3511, 1, 1024] - - [875, 9702.6] + - [878, 9702.6] - - [122, 122, 528, 64] - - [916, 6293.29] + - [919, 6293.29] - - [1024, 3365, 1, 4096] - - [892, 8413.52] + - [895, 8413.52] - - [1024, 3504, 1, 4096] - - [894, 8414.36] + - [897, 8414.36] - - [1024, 3442, 1, 4096] - - [897, 8683.9] + - [900, 8683.9] - - [4096, 3474, 1, 1024] - - [875, 9611.5] + - [878, 9611.5] - - [4096, 2984, 1, 1024] - - [876, 9592.72] + - [879, 9592.72] - - [1024, 3876, 1, 4096] - - [879, 9085.85] + - [882, 9085.85] - - [4096, 3337, 1, 1024] - - [877, 9246.12] + - [880, 9246.12] - - [4096, 3450, 1, 1024] - - [877, 9534.53] + - [880, 9534.53] - - [1024, 3547, 1, 4096] - - [897, 8386.63] + - [900, 8386.63] - - [4096, 3291, 1, 1024] - - [876, 9759.24] + - [879, 9759.24] - - [1024, 3340, 1, 4096] - - [896, 8237.87] + - [899, 8237.87] - - [4096, 3491, 1, 1024] - - [877, 9656.49] + - [880, 9656.49] - - [4096, 3348, 1, 1024] - - [877, 9279.05] + - [880, 9279.05] - - [78, 78, 816, 64] - - [917, 3590.99] + - [920, 3590.99] - - [4096, 3968, 1, 1024] - - [878, 9642.09] + - [881, 9642.09] - - [4096, 3906, 1, 1024] - - [878, 9485.27] + - [881, 9485.27] - - [1024, 3477, 1, 4096] - - [885, 8389.1] + - [888, 8389.1] - - [1024, 3397, 1, 4096] - - [895, 8556.78] + - [898, 8556.78] - - [4096, 3165, 1, 1024] - - [876, 9415.42] + - [879, 9415.42] - - [4096, 3470, 1, 1024] - - [875, 9598.4] + - [878, 9598.4] - - [1024, 3526, 1, 4096] - - [895, 8442.05] + - [898, 8442.05] - - [112, 112, 576, 64] - - [910, 5672.5] + - [913, 5672.5] - - [4096, 3365, 1, 1024] - - [875, 9321.73] + - [878, 9321.73] - - [4096, 3319, 1, 1024] - - [875, 9838.38] + - [878, 9838.38] - - [1024, 3401, 1, 4096] - - [897, 8460.76] + - [900, 8460.76] - - [1024, 3294, 1, 4096] - - [896, 8324.53] + - [899, 8324.53] - - [159, 159, 400, 64] - - [912, 5488.41] + - [915, 5488.41] - - [1024, 3472, 1, 4096] - - [890, 8289.67] + - [893, 8289.67] - - [4096, 3328, 1, 1024] - - [876, 9904.25] + - [879, 9904.25] - - [1024, 3861, 1, 1024] - - [881, 8917.53] + - [884, 8917.53] - - [1024, 3910, 1, 1024] - - [879, 9010.06] + - [882, 9010.06] - - [1024, 3410, 1, 4096] - - [897, 8519.53] + - [900, 8519.53] - - [1024, 3395, 1, 4096] - - [895, 8424.25] + - [898, 8424.25] - - [4096, 3282, 1, 1024] - - [875, 9743.57] + - [878, 9743.57] - - [1024, 3751, 1, 1024] - - [882, 8680.29] + - [885, 8680.29] - - [4096, 3145, 1, 1024] - - [877, 9353.27] + - [880, 9353.27] - - [4096, 3514, 1, 1024] - - [877, 9712.94] + - [880, 9712.94] - - [4096, 3944, 1, 1024] - - [877, 9563.82] + - [880, 9563.82] - - [1024, 3515, 1, 4096] - - [896, 8428.03] + - [899, 8428.03] - - [4096, 3409, 1, 1024] - - [876, 9428.67] + - [879, 9428.67] - - [4096, 3564, 1, 1024] - - [875, 9823.69] + - [878, 9823.69] - - [4096, 3299, 1, 1024] - - [877, 9792.93] + - [880, 9792.93] - - [1024, 3057, 1, 4096] - - [873, 9237.75] + - [876, 9237.75] - - [4096, 3531, 1, 1024] - - [875, 9745.54] + - [878, 9745.54] - - [4096, 3388, 1, 1024] - - [877, 9374.55] + - [880, 9374.55] - - [1024, 3189, 1, 4096] - - [897, 8084.5] + - [900, 8084.5] - - [1024, 3300, 1, 4096] - - [897, 8185.03] + - [900, 8185.03] - - [1024, 3720, 1, 4096] - - [876, 8755.01] + - [879, 8755.01] - - [1024, 3383, 1, 4096] - - [890, 8463.37] + - [893, 8463.37] - - [1024, 3494, 1, 4096] - - [897, 8676.47] + - [900, 8676.47] - - [77, 78, 816, 64] - - [913, 3548.16] + - [916, 3548.16] - - [1024, 3448, 1, 4096] - - [895, 8665.68] + - [898, 8665.68] - - [4096, 3542, 1, 1024] - - [875, 9771.78] + - [878, 9771.78] - - [1024, 3488, 1, 4096] - - [895, 8488.29] + - [898, 8488.29] - - [4096, 3405, 1, 1024] - - [877, 9426.06] + - [880, 9426.06] - - [1024, 3262, 1, 4096] - - [897, 8206.87] + - [900, 8206.87] - - [33708, 4005, 1, 1024] - - [878, 9928.06] + - [881, 9928.06] - - [1024, 3594, 1, 4096] - - [882, 8458.47] + - [885, 8458.47] - - [4096, 3103, 1, 1024] - - [878, 9243.04] + - [881, 9243.04] - - [4096, 3136, 1, 1024] - - [877, 9340.8] + - [880, 9340.8] - - [1024, 3378, 1, 4096] - - [898, 8432.35] + - [901, 8432.35] - - [10, 10, 5952, 64] - - [918, 523.253] + - [921, 523.253] - - [7, 7, 8192, 64] - - [918, 260.443] + - [921, 260.443] - - [4096, 3559, 1, 1024] - - [877, 9813.0] + - [880, 9813.0] - - [4096, 3368, 1, 1024] - - [878, 9328.56] + - [881, 9328.56] - - [4096, 3209, 1, 1024] - - [875, 9538.73] + - [878, 9538.73] - - [4096, 3322, 1, 1024] - - [877, 9839.48] + - [880, 9839.48] - - [1024, 3483, 1, 4096] - - [883, 8348.25] + - [886, 8348.25] - - [4096, 3473, 1, 1024] - - [876, 9605.69] + - [879, 9605.69] - - [4096, 3522, 1, 1024] - - [878, 9729.92] + - [881, 9729.92] - - [1024, 3532, 1, 4096] - - [896, 8474.22] + - [899, 8474.22] - - [4096, 3449, 1, 1024] - - [877, 9528.25] + - [880, 9528.25] - - [1024, 3351, 1, 4096] - - [898, 8311.13] + - [901, 8311.13] - - [1024, 3462, 1, 4096] - - [895, 8297.54] + - [898, 8297.54] - - [4096, 3396, 1, 1024] - - [877, 9400.15] + - [880, 9400.15] - - [132, 132, 480, 64] - - [923, 4089.74] + - [926, 4089.74] - - [111, 112, 576, 64] - - [909, 5529.6] + - [912, 5529.6] - - [1024, 3416, 1, 4096] - - [896, 8556.54] + - [899, 8556.54] - - [4096, 3469, 1, 1024] - - [878, 9598.67] + - [881, 9598.67] - - [1024, 3582, 1, 4096] - - [879, 8461.37] + - [882, 8461.37] - - [1024, 3230, 1, 4096] - - [896, 8188.84] + - [899, 8188.84] - - [1024, 3489, 1, 4096] - - [897, 8457.75] + - [900, 8457.75] - - [1024, 3427, 1, 4096] - - [897, 8566.49] + - [900, 8566.49] - - [1024, 3346, 1, 4096] - - [896, 8352.07] + - [899, 8352.07] - - [33708, 3977, 1, 1024] - - [878, 9868.4] + - [881, 9868.4] - - [4096, 3796, 1, 1024] - - [877, 9797.66] + - [880, 9797.66] - - [4096, 3176, 1, 1024] - - [877, 9435.29] + - [880, 9435.29] - - [4096, 3990, 1, 1024] - - [875, 9672.23] + - [878, 9672.23] - - [1024, 3257, 1, 4096] - - [898, 8225.07] + - [901, 8225.07] - - [4096, 3343, 1, 1024] - - [899, 9273.52] + - [902, 9273.52] - - [4096, 3440, 1, 1024] - - [875, 9501.38] + - [878, 9501.38] - - [33708, 4030, 1, 1024] - - [876, 9983.26] + - [879, 9983.26] - - [1024, 3190, 1, 4096] - - [897, 8192.01] + - [900, 8192.01] - - [1024, 3389, 1, 4096] - - [898, 8439.32] + - [901, 8439.32] - - [1024, 3500, 1, 4096] - - [896, 8556.02] + - [899, 8556.02] - - [1024, 3471, 1, 4096] - - [885, 8491.07] + - [888, 8491.07] - - [1024, 3438, 1, 4096] - - [898, 8567.85] + - [901, 8567.85] - - [4096, 3513, 1, 1024] - - [875, 9710.17] + - [878, 9710.17] - - [1024, 3562, 1, 4096] - - [890, 8608.84] + - [893, 8608.84] - - [4096, 3616, 1, 1024] - - [877, 9357.49] + - [880, 9357.49] - - [4096, 3955, 1, 1024] - - [876, 9589.61] + - [879, 9589.61] - - [1024, 3441, 1, 4096] - - [886, 8359.17] + - [889, 8359.17] - - [1024, 3236, 1, 4096] - - [900, 8022.5] + - [903, 8022.5] - - [1024, 3524, 1, 4096] - - [895, 8477.14] + - [898, 8477.14] - - [4096, 3460, 1, 1024] - - [875, 9581.86] + - [878, 9581.86] - - [16, 16, 3840, 64] - - [907, 1270.49] + - [910, 1270.49] - - [92, 93, 688, 64] - - [911, 4962.3] + - [914, 4962.3] - - [1024, 3384, 1, 4096] - - [886, 8409.29] + - [889, 8409.29] - - [4096, 3387, 1, 1024] - - [877, 9379.7] + - [880, 9379.7] - - [4096, 3436, 1, 1024] - - [875, 9491.83] + - [878, 9491.83] - - [4096, 3277, 1, 1024] - - [875, 9717.17] + - [878, 9717.17] - - [1024, 3457, 1, 4096] - - [895, 8279.12] + - [898, 8279.12] - - [1024, 3999, 1, 4096] - - [870, 9231.37] + - [873, 9231.37] - - [1024, 4032, 1, 4096] - - [879, 9443.52] + - [882, 9443.52] - - [4096, 3541, 1, 1024] - - [875, 9773.14] + - [878, 9773.14] - - [4096, 3334, 1, 1024] - - [875, 9242.69] + - [878, 9242.69] - - [1024, 3393, 1, 4096] - - [897, 8376.07] + - [900, 8376.07] - - [17, 17, 3632, 64] - - [919, 1425.67] + - [922, 1425.67] - - [1024, 3411, 1, 4096] - - [885, 8490.87] + - [888, 8490.87] - - [1024, 3822, 1, 1024] - - [882, 8773.34] + - [885, 8773.34] - - [1024, 3593, 1, 4096] - - [882, 8571.15] + - [885, 8571.15] - - [33708, 3822, 1, 1024] - - [876, 10056.7] + - [879, 10056.7] - - [4096, 3504, 1, 1024] - - [878, 9680.19] + - [881, 9680.19] - - [1024, 3163, 1, 4096] - - [897, 8014.33] + - [900, 8014.33] - - [1024, 3357, 1, 4096] - - [898, 8375.94] + - [901, 8375.94] - - [1024, 3906, 1, 4096] - - [879, 9108.12] + - [882, 9108.12] - - [4096, 3415, 1, 1024] - - [875, 9443.77] + - [878, 9443.77] - - [1024, 3406, 1, 4096] - - [898, 8451.54] + - [901, 8451.54] - - [4096, 3321, 1, 1024] - - [877, 9836.52] + - [880, 9836.52] - - [4096, 3584, 1, 1024] - - [878, 9915.83] + - [881, 9915.83] - - [1024, 2736, 1, 4096] - - [881, 8532.83] + - [884, 8532.83] - - [1024, 3110, 1, 4096] - - [898, 7889.19] + - [901, 7889.19] - - [33708, 3999, 1, 1024] - - [878, 9903.23] + - [881, 9903.23] - - [1024, 3093, 1, 4096] - - [896, 7919.25] + - [899, 7919.25] - - [4096, 3378, 1, 1024] - - [878, 9362.2] + - [881, 9362.2] - - [1024, 3543, 1, 4096] - - [892, 8438.06] + - [895, 8438.06] - - [33708, 3925, 1, 1024] - - [877, 10021.5] + - [880, 10021.5] - - [1024, 3352, 1, 4096] - - [898, 8333.72] + - [901, 8333.72] - - [4096, 3780, 1, 1024] - - [875, 9754.92] + - [878, 9754.92] - - [1024, 3990, 1, 4096] - - [872, 9250.92] + - [875, 9250.92] - - [4096, 3500, 1, 1024] - - [875, 9673.73] + - [878, 9673.73] - - [4096, 3996, 1, 1024] - - [876, 9694.4] + - [879, 9694.4] - - [1024, 3247, 1, 4096] - - [901, 8171.48] + - [904, 8171.48] - - [4096, 3395, 1, 1024] - - [877, 9391.94] + - [880, 9391.94] - - [1024, 3169, 1, 4096] - - [896, 7990.14] + - [899, 7990.14] - - [1024, 3088, 1, 4096] - - [896, 7890.26] + - [899, 7890.26] - - [1024, 3584, 1, 4096] - - [898, 8604.1] + - [901, 8604.1] - - [4096, 3093, 1, 1024] - - [877, 9224.78] + - [880, 9224.78] - - [1024, 3538, 1, 4096] - - [879, 8395.64] + - [882, 8395.64] - - [1024, 3996, 1, 1024] - - [880, 9208.23] + - [883, 9208.23] - - [1024, 3581, 1, 4096] - - [892, 8523.14] + - [895, 8523.14] - - [4096, 3374, 1, 1024] - - [877, 9342.71] + - [880, 9342.71] - - [33708, 3751, 1, 1024] - - [877, 9881.89] + - [880, 9881.89] - - [59, 59, 1088, 64] - - [915, 4515.44] + - [918, 4515.44] - - [4096, 3215, 1, 1024] - - [877, 9557.65] + - [880, 9557.65] - - [4096, 3312, 1, 1024] - - [875, 9834.3] + - [878, 9834.3] - - [4096, 3581, 1, 1024] - - [877, 9856.56] + - [880, 9856.56] - - [4096, 3479, 1, 1024] - - [877, 9620.25] + - [880, 9620.25] - - [4096, 3544, 1, 1024] - - [875, 9778.84] + - [878, 9778.84] - - [1024, 3870, 1, 1024] - - [880, 8935.16] + - [883, 8935.16] - - [1024, 3374, 1, 4096] - - [897, 8412.75] + - [900, 8412.75] - - [1024, 2967, 1, 4096] - - [880, 8982.87] + - [883, 8982.87] - - [41, 41, 1552, 64] - - [909, 2805.28] + - [912, 2805.28] - - [4096, 3455, 1, 1024] - - [875, 9538.79] + - [878, 9538.79] - - [4096, 3942, 1, 1024] - - [876, 9554.55] + - [879, 9554.55] - - [1024, 3528, 1, 4096] - - [895, 8438.37] + - [898, 8438.37] - - [4096, 3186, 1, 1024] - - [876, 9468.22] + - [879, 9468.22] - - [1024, 3976, 1, 1024] - - [880, 9166.98] + - [883, 9166.98] - - [1024, 3511, 1, 4096] - - [882, 8334.96] + - [885, 8334.96] - - [4096, 3573, 1, 1024] - - [875, 9855.23] + - [878, 9855.23] - - [4096, 3561, 1, 1024] - - [875, 9830.93] + - [878, 9830.93] - - [4096, 3418, 1, 1024] - - [876, 9450.58] + - [879, 9450.58] - - [33708, 3906, 1, 1024] - - [878, 9973.57] + - [881, 9973.57] - - [4096, 3259, 1, 1024] - - [875, 9685.16] + - [878, 9685.16] - - [4096, 3308, 1, 1024] - - [877, 9791.93] + - [880, 9791.93] - - [1024, 3419, 1, 4096] - - [897, 8514.43] + - [900, 8514.43] - - [1024, 3215, 1, 4096] - - [896, 8137.43] + - [899, 8137.43] - - [1024, 4030, 1, 4096] - - [878, 9290.66] + - [881, 9290.66] - - [4096, 3459, 1, 1024] - - [875, 9567.47] + - [878, 9567.47] - - [1024, 3572, 1, 4096] - - [895, 8501.33] + - [898, 8501.33] - - [1024, 3137, 1, 4096] - - [897, 7930.05] + - [900, 7930.05] - - [1024, 3312, 1, 4096] - - [898, 8378.5] + - [901, 8378.5] - - [1024, 3925, 1, 4096] - - [880, 9255.76] + - [883, 9255.76] - - [1024, 3453, 1, 4096] - - [897, 8630.66] + - [900, 8630.66] - - [4096, 3435, 1, 1024] - - [876, 9495.08] + - [879, 9495.08] - - [1024, 3176, 1, 4096] - - [897, 8087.13] + - [900, 8087.13] - - [1024, 3444, 1, 4096] - - [889, 8528.48] + - [892, 8528.48] - - [4096, 3975, 1, 1024] - - [878, 9645.24] + - [881, 9645.24] - - [4096, 3182, 1, 1024] - - [877, 9448.3] + - [880, 9448.3] - - [1024, 3475, 1, 4096] - - [896, 8404.77] + - [899, 8404.77] - - [9, 9, 6544, 64] - - [911, 425.754] + - [914, 425.754] - - [33708, 3955, 1, 1024] - - [878, 10088.3] + - [881, 10088.3] - - [4096, 3446, 1, 1024] - - [877, 9519.96] + - [880, 9519.96] - - [1024, 3138, 1, 4096] - - [896, 8053.34] + - [899, 8053.34] - - [1024, 3549, 1, 4096] - - [882, 8426.32] + - [885, 8426.32] - - [4096, 3287, 1, 1024] - - [878, 9751.24] + - [881, 9751.24] - - [1024, 3342, 1, 4096] - - [895, 8319.91] + - [898, 8319.91] - - [102, 102, 624, 64] - - [910, 4747.42] + - [913, 4747.42] - - [4096, 3519, 1, 1024] - - [877, 9716.0] + - [880, 9716.0] - - [4096, 3552, 1, 1024] - - [875, 9806.59] + - [878, 9806.59] - - [4096, 3859, 1, 1024] - - [875, 9369.84] + - [878, 9369.84] - - [33708, 3969, 1, 1024] - - [875, 9830.29] + - [878, 9830.29] - - [1024, 3369, 1, 4096] - - [896, 8379.16] + - [899, 8379.16] - - [4096, 3482, 1, 1024] - - [875, 9631.6] + - [878, 9631.6] - - [1024, 3306, 1, 4096] - - [898, 8319.96] + - [901, 8319.96] - - [1024, 3474, 1, 4096] - - [897, 8498.8] + - [900, 8498.8] - - [99, 99, 624, 64] - - [909, 4492.8] + - [912, 4492.8] - - [4096, 3377, 1, 1024] - - [875, 9369.82] + - [878, 9369.82] - - [4096, 3426, 1, 1024] - - [875, 9467.2] + - [878, 9467.2] - - [4096, 2935, 1, 1024] - - [876, 9423.64] + - [879, 9423.64] - - [4096, 3267, 1, 1024] - - [875, 9697.94] + - [878, 9697.94] - - [1024, 3299, 1, 4096] - - [896, 8264.66] + - [899, 8264.66] - - [1024, 3456, 1, 4096] - - [895, 8678.29] + - [898, 8678.29] - - [1024, 3280, 1, 4096] - - [896, 8220.59] + - [899, 8220.59] - - [1024, 3555, 1, 4096] - - [895, 8656.17] + - [898, 8656.17] - - [4096, 3499, 1, 1024] - - [877, 9663.83] + - [880, 9663.83] - - [4096, 3356, 1, 1024] - - [877, 9296.8] + - [880, 9296.8] - - [100, 102, 624, 64] - - [910, 4671.41] + - [913, 4671.41] - - [1024, 3412, 1, 4096] - - [898, 8537.95] + - [901, 8537.95] - - [1024, 2984, 1, 4096] - - [881, 9193.07] + - [884, 9193.07] - - [4096, 3141, 1, 1024] - - [877, 9349.33] + - [880, 9349.33] - - [4096, 3510, 1, 1024] - - [875, 9701.88] + - [878, 9701.88] - - [1024, 3995, 1, 1024] - - [879, 9243.3] + - [882, 9243.3] - - [1024, 3517, 1, 4096] - - [897, 8569.21] + - [900, 8569.21] - - [1024, 3455, 1, 4096] - - [897, 8560.57] + - [900, 8560.57] - - [1024, 3939, 1, 1024] - - [880, 9030.84] + - [883, 9030.84] - - [38, 38, 1680, 64] - - [909, 2459.74] + - [912, 2459.74] - - [1024, 3447, 1, 4096] - - [895, 8609.92] + - [898, 8609.92] - - [1024, 3969, 1, 4096] - - [882, 9097.23] + - [885, 9097.23] - - [4096, 3527, 1, 1024] - - [877, 9743.73] + - [880, 9743.73] - - [4096, 3336, 1, 1024] - - [877, 9248.23] + - [880, 9248.23] - - [1024, 3191, 1, 4096] - - [895, 8104.86] + - [898, 8104.86] - - [1024, 3302, 1, 4096] - - [896, 8244.99] + - [899, 8244.99] - - [1024, 3337, 1, 4096] - - [898, 8254.15] + - [901, 8254.15] - - [4096, 3290, 1, 1024] - - [877, 9759.03] + - [880, 9759.03] - - [1024, 3512, 1, 4096] - - [886, 8640.96] + - [889, 8640.96] - - [1024, 3433, 1, 4096] - - [896, 8444.6] + - [899, 8444.6] - - [4096, 3876, 1, 1024] - - [876, 9420.28] + - [879, 9420.28] - - [4096, 3490, 1, 1024] - - [877, 9641.01] + - [880, 9641.01] - - [4096, 3064, 1, 1024] - - [877, 9820.39] + - [880, 9820.39] - - [1024, 3508, 1, 4096] - - [892, 8442.14] + - [895, 8442.14] - - [1024, 3956, 1, 4096] - - [877, 9128.09] + - [880, 9128.09] - - [4096, 3417, 1, 1024] - - [877, 9448.31] + - [880, 9448.31] - - [1024, 3248, 1, 4096] - - [896, 8006.06] + - [899, 8006.06] - - [1024, 2499, 1, 4096] - - [896, 8155.09] + - [899, 8155.09] - - [1024, 3186, 1, 4096] - - [896, 8092.94] + - [899, 8092.94] - - [1024, 3180, 1, 4096] - - [898, 8096.92] + - [901, 8096.92] - - [4096, 3364, 1, 1024] - - [877, 9317.98] + - [880, 9317.98] - - [4096, 3976, 1, 1024] - - [877, 9654.37] + - [880, 9654.37] - - [4096, 3205, 1, 1024] - - [878, 9538.74] + - [881, 9538.74] - - [4096, 3318, 1, 1024] - - [875, 9838.19] + - [878, 9838.19] - - [1024, 3377, 1, 4096] - - [898, 8445.54] + - [901, 8445.54] - - [1024, 3485, 1, 4096] - - [895, 8368.73] + - [898, 8368.73] - - [4096, 3181, 1, 1024] - - [878, 9458.19] + - [881, 9458.19] - - [4096, 3550, 1, 1024] - - [875, 9783.04] + - [878, 9783.04] - - [1024, 3534, 1, 4096] - - [884, 8684.89] + - [887, 8684.89] - - [1024, 3860, 1, 1024] - - [879, 8923.08] + - [882, 8923.08] - - [160, 160, 400, 64] - - [922, 5797.59] + - [925, 5797.59] - - [4096, 3445, 1, 1024] - - [877, 9511.18] + - [880, 9511.18] - - [1024, 3391, 1, 4096] - - [898, 8541.67] + - [901, 8541.67] - - [1024, 3221, 1, 4096] - - [896, 8055.4] + - [899, 8055.4] - - [4096, 3079, 1, 1024] - - [875, 9180.94] + - [878, 9180.94] - - [4096, 3144, 1, 1024] - - [877, 9351.35] + - [880, 9351.35] - - [1024, 3270, 1, 4096] - - [897, 8367.53] + - [900, 8367.53] - - [1024, 3561, 1, 4096] - - [897, 8426.19] + - [900, 8426.19] - - [1024, 3480, 1, 4096] - - [884, 8464.9] + - [887, 8464.9] - - [4096, 3408, 1, 1024] - - [877, 9419.94] + - [880, 9419.94] - - [1024, 3418, 1, 4096] - - [898, 8480.92] + - [901, 8480.92] - - [4096, 3298, 1, 1024] - - [878, 9788.3] + - [881, 9788.3] - - [1024, 3640, 1, 1024] - - [881, 8435.34] + - [884, 8435.34] - - [1024, 3449, 1, 4096] - - [896, 8590.77] + - [899, 8590.77] - - [1024, 4020, 1, 4096] - - [874, 9168.03] + - [877, 9168.03] - - [4096, 3481, 1, 1024] - - [875, 9627.81] + - [878, 9627.81] - - [4096, 3530, 1, 1024] - - [877, 9734.58] + - [880, 9734.58] - - [1024, 3216, 1, 4096] - - [898, 8014.22] + - [901, 8014.22] - - [1024, 3840, 1, 1024] - - [881, 8908.27] + - [884, 8908.27] - - [1024, 3491, 1, 4096] - - [884, 8410.49] + - [887, 8410.49] - - [1024, 3154, 1, 4096] - - [897, 8095.59] + - [900, 8095.59] - - [4096, 3425, 1, 1024] - - [877, 9474.43] + - [880, 9474.43] - - [1024, 3348, 1, 4096] - - [895, 8202.8] + - [898, 8202.8] - - [1024, 3415, 1, 4096] - - [896, 8597.58] + - [899, 8597.58] - - [1024, 4026, 1, 1024] - - [879, 9278.99] + - [882, 9278.99] - - [1024, 3367, 1, 4096] - - [898, 8335.44] + - [901, 8335.44] - - [1024, 3259, 1, 4096] - - [898, 8285.2] + - [901, 8285.2] - - [1024, 3894, 1, 4096] - - [881, 9040.34] + - [884, 9040.34] - - [4096, 3355, 1, 1024] - - [876, 9291.57] + - [879, 9291.57] - - [4096, 3404, 1, 1024] - - [877, 9410.37] + - [880, 9410.37] - - [1024, 3308, 1, 4096] - - [898, 8336.2] + - [901, 8336.2] - - [4096, 3245, 1, 1024] - - [876, 9641.37] + - [879, 9641.37] - - [1024, 3502, 1, 4096] - - [897, 8375.8] + - [900, 8375.8] - - [33708, 4032, 1, 1024] - - [876, 9988.1] + - [879, 9988.1] - - [8, 8, 7280, 64] - - [913, 339.778] + - [916, 339.778] - - [1024, 3424, 1, 4096] - - [884, 8489.38] + - [887, 8489.38] - - [4096, 3509, 1, 1024] - - [876, 9702.19] + - [879, 9702.19] - - [4096, 3558, 1, 1024] - - [877, 9815.41] + - [880, 9815.41] - - [1024, 3900, 1, 1024] - - [880, 9013.95] + - [883, 9013.95] - - [1024, 2505, 1, 4096] - - [894, 8263.65] + - [897, 8263.65] - - [4096, 3472, 1, 1024] - - [875, 9609.51] + - [878, 9609.51] - - [1024, 3386, 1, 4096] - - [895, 8417.45] + - [898, 8417.45] - - [4096, 3383, 1, 1024] - - [877, 9364.67] + - [880, 9364.67] - - [4096, 3448, 1, 1024] - - [878, 9520.97] + - [881, 9520.97] - - [4096, 4030, 1, 1024] - - [878, 9771.46] + - [881, 9771.46] - - [4096, 3289, 1, 1024] - - [875, 9757.17] + - [878, 9757.17] - - [1024, 3459, 1, 4096] - - [897, 8422.02] + - [900, 8422.02] - - [1024, 2918, 1, 4096] - - [882, 9022.61] + - [885, 9022.61] - - [4096, 3489, 1, 1024] - - [875, 9641.8] + - [878, 9641.8] - - [4096, 3346, 1, 1024] - - [877, 9271.55] + - [880, 9271.55] - - [4096, 3572, 1, 1024] - - [877, 9829.72] + - [880, 9829.72] - - [1024, 3955, 1, 4096] - - [878, 9221.56] + - [881, 9221.56] - - [4096, 3236, 1, 1024] - - [875, 9620.62] + - [878, 9620.62] - - [4096, 3163, 1, 1024] - - [875, 9397.2] + - [878, 9397.2] - - [4096, 3468, 1, 1024] - - [875, 9601.48] + - [878, 9601.48] - - [1024, 3165, 1, 4096] - - [897, 7941.48] + - [900, 7941.48] - - [1024, 3276, 1, 4096] - - [897, 8244.86] + - [900, 8244.86] - - [1024, 3359, 1, 4096] - - [895, 8273.83] + - [898, 8273.83] - - [4096, 3363, 1, 1024] - - [877, 9315.7] + - [880, 9315.7] - - [1024, 3385, 1, 4096] - - [889, 8286.1] + - [892, 8286.1] - - [1024, 3207, 1, 4096] - - [898, 8143.92] + - [901, 8143.92] - - [1024, 3458, 1, 4096] - - [897, 8472.31] + - [900, 8472.31] - - [21, 21, 2976, 64] - - [913, 2083.2] + - [916, 2083.2] - - [4096, 3110, 1, 1024] - - [875, 9260.2] + - [878, 9260.2] - - [4096, 3925, 1, 1024] - - [878, 9526.56] + - [881, 9526.56] - - [1024, 3975, 1, 4096] - - [873, 9133.74] + - [876, 9133.74] - - [4096, 3549, 1, 1024] - - [877, 9793.67] + - [880, 9793.67] - - [4096, 3342, 1, 1024] - - [876, 9264.38] + - [879, 9264.38] - - [1024, 3859, 1, 1024] - - [879, 8933.37] + - [882, 8933.37] - - [1024, 3497, 1, 4096] - - [896, 8526.03] + - [899, 8526.03] - - [4096, 3280, 1, 1024] - - [877, 9733.22] + - [880, 9733.22] - - [1024, 3435, 1, 4096] - - [896, 8489.75] + - [899, 8489.75] - - [1024, 3354, 1, 4096] - - [896, 8248.73] + - [899, 8248.73] - - [4096, 3191, 1, 1024] - - [876, 9475.02] + - [879, 9475.02] - - [4096, 3512, 1, 1024] - - [875, 9701.27] + - [878, 9701.27] - - [1024, 3055, 1, 4096] - - [882, 9264.81] + - [885, 9264.81] - - [4096, 2499, 1, 1024] - - [877, 9573.96] + - [880, 9573.96] - - [1024, 3233, 1, 4096] - - [895, 8101.64] + - [898, 8101.64] - - [4096, 3423, 1, 1024] - - [878, 9463.4] + - [881, 9463.4] - - [1024, 3319, 1, 4096] - - [898, 8413.66] + - [901, 8413.66] - - [4096, 3297, 1, 1024] - - [875, 9782.56] + - [878, 9782.56] - - [4096, 3154, 1, 1024] - - [877, 9381.1] + - [880, 9381.1] - - [1024, 3540, 1, 4096] - - [898, 8507.43] + - [901, 8507.43] - - [1024, 3289, 1, 4096] - - [898, 8233.7] + - [901, 8233.7] - - [4096, 3529, 1, 1024] - - [877, 9741.05] + - [880, 9741.05] - - [4096, 3386, 1, 1024] - - [877, 9372.47] + - [880, 9372.47] - - [4096, 3276, 1, 1024] - - [875, 9713.66] + - [878, 9713.66] - - [1024, 3244, 1, 4096] - - [898, 8146.73] + - [901, 8146.73] - - [1024, 3182, 1, 4096] - - [895, 8115.02] + - [898, 8115.02] - - [4096, 3540, 1, 1024] - - [875, 9768.32] + - [878, 9768.32] - - [1024, 3360, 1, 4096] - - [897, 8353.21] + - [900, 8353.21] - - [1024, 3942, 1, 4096] - - [876, 9143.68] + - [879, 9143.68] - - [4096, 3403, 1, 1024] - - [878, 9412.08] + - [881, 9412.08] - - [4096, 3101, 1, 1024] - - [878, 9239.18] + - [881, 9239.18] - - [4096, 2918, 1, 1024] - - [877, 9373.65] + - [880, 9373.65] - - [1024, 3465, 1, 4096] - - [898, 8288.06] + - [901, 8288.06] - - [33708, 3780, 1, 1024] - - [877, 9971.81] + - [880, 9971.81] - - [4096, 3557, 1, 1024] - - [875, 9814.72] + - [878, 9814.72] - - [4096, 3414, 1, 1024] - - [875, 9436.53] + - [878, 9436.53] - - [1024, 3948, 1, 1024] - - [879, 9073.7] + - [882, 9073.7] - - [4096, 3320, 1, 1024] - - [877, 9834.67] + - [880, 9834.67] - - [4096, 2765, 1, 1024] - - [877, 9666.96] + - [880, 9666.96] - - [1024, 3978, 1, 4096] - - [872, 9109.5] + - [875, 9109.5] - - [4096, 3487, 1, 1024] - - [875, 9643.9] + - [878, 9643.9] - - [4096, 3520, 1, 1024] - - [877, 9727.98] + - [880, 9727.98] - - [1024, 3139, 1, 4096] - - [897, 7940.09] + - [900, 7940.09] - - [1024, 3314, 1, 4096] - - [895, 8293.91] + - [898, 8293.91] - - [4096, 3431, 1, 1024] - - [877, 9482.02] + - [880, 9482.02] - - [123, 122, 528, 64] - - [910, 6325.88] + - [913, 6325.88] - - [1024, 3446, 1, 4096] - - [891, 8468.24] + - [894, 8468.24] - - [1024, 4059, 1, 4096] - - [878, 9370.7] + - [881, 9370.7] - - [99, 102, 624, 64] - - [910, 4624.7] + - [913, 4624.7] - - [4096, 3345, 1, 1024] - - [875, 9271.22] + - [878, 9271.22] - - [4096, 3394, 1, 1024] - - [875, 9398.09] + - [878, 9398.09] - - [1024, 3927, 1, 1024] - - [880, 9041.28] + - [883, 9041.28] - - [4096, 3235, 1, 1024] - - [875, 9619.83] + - [878, 9619.83] - - [1024, 3328, 1, 4096] - - [896, 8405.99] + - [899, 8405.99] - - [33708, 3956, 1, 1024] - - [876, 10100.3] + - [879, 10100.3] - - [4096, 3467, 1, 1024] - - [877, 9586.56] + - [880, 9586.56] - - [1024, 3287, 1, 4096] - - [897, 8273.73] + - [900, 8273.73] - - [4096, 3214, 1, 1024] - - [878, 9557.39] + - [881, 9557.39] - - [4096, 3910, 1, 1024] - - [875, 9490.15] + - [878, 9490.15] - - [1024, 3780, 1, 1024] - - [882, 8705.9] + - [885, 8705.9] - - [1024, 3371, 1, 4096] - - [898, 8248.36] + - [901, 8248.36] - - [4096, 3478, 1, 1024] - - [878, 9619.52] + - [881, 9619.52] - - [1024, 3546, 1, 4096] - - [896, 8456.73] + - [899, 8456.73] - - [1024, 4012, 1, 1024] - - [879, 9253.24] + - [882, 9253.24] - - [4096, 3341, 1, 1024] - - [877, 9260.14] + - [880, 9260.14] - - [4096, 3454, 1, 1024] - - [875, 9533.52] + - [878, 9533.52] - - [4096, 3295, 1, 1024] - - [878, 9772.76] + - [881, 9772.76] - - [4096, 3072, 1, 1024] - - [875, 9887.13] + - [878, 9887.13] - - [1024, 3282, 1, 4096] - - [883, 8112.75] + - [886, 8112.75] - - [33708, 3720, 1, 1024] - - [878, 9818.75] + - [881, 9818.75] - - [1024, 3681, 1, 4096] - - [880, 8639.18] + - [883, 8639.18] - - [1024, 4050, 1, 4096] - - [878, 9291.83] + - [881, 9291.83] - - [4096, 3495, 1, 1024] - - [877, 9660.42] + - [880, 9660.42] - - [4096, 3560, 1, 1024] - - [876, 9813.7] + - [879, 9813.7] - - [4096, 3751, 1, 1024] - - [875, 9684.85] + - [878, 9684.85] - - [1024, 3414, 1, 4096] - - [896, 8555.62] + - [899, 8555.62] - - [33708, 3860, 1, 1024] - - [875, 9856.58] + - [878, 9856.58] - - [1024, 3325, 1, 4096] - - [885, 8261.11] + - [888, 8261.11] - - [4096, 3458, 1, 1024] - - [875, 9570.76] + - [878, 9570.76] - - [4096, 2967, 1, 1024] - - [875, 9544.51] + - [878, 9544.51] - - [1024, 3519, 1, 4096] - - [898, 8413.0] + - [901, 8413.0] - - [4096, 3385, 1, 1024] - - [877, 9367.24] + - [880, 9367.24] - - [4096, 3434, 1, 1024] - - [875, 9488.31] + - [878, 9488.31] - - [1024, 3552, 1, 4096] - - [896, 8456.03] + - [899, 8456.03] - - [4096, 3822, 1, 1024] - - [876, 9849.74] + - [879, 9849.74] - - [1024, 3544, 1, 4096] - - [895, 8494.46] + - [898, 8494.46] - - [4096, 3539, 1, 1024] - - [877, 9762.99] + - [880, 9762.99] - - [4096, 3332, 1, 1024] - - [875, 9232.26] + - [878, 9232.26] - - [1024, 3145, 1, 4096] - - [895, 8098.26] + - [898, 8098.26] - - [1024, 3535, 1, 4096] - - [883, 8592.7] + - [886, 8592.7] - - [1024, 3320, 1, 4096] - - [896, 8419.45] + - [899, 8419.45] - - [33708, 4012, 1, 1024] - - [878, 9940.1] + - [881, 9940.1] - - [4096, 3286, 1, 1024] - - [877, 9747.72] + - [880, 9747.72] - - [1024, 3514, 1, 4096] - - [896, 8653.59] + - [899, 8653.59] - - [93, 93, 688, 64] - - [917, 5005.69] + - [920, 5005.69] - - [1024, 2765, 1, 4096] - - [882, 8636.62] + - [885, 8636.62] - - [1024, 3452, 1, 4096] - - [895, 8445.77] + - [898, 8445.77] - - [4096, 3518, 1, 1024] - - [875, 9722.46] + - [878, 9722.46] - - [1024, 3529, 1, 4096] - - [895, 8444.22] + - [898, 8444.22] - - [4096, 3413, 1, 1024] - - [875, 9436.25] + - [878, 9436.25] - - [33708, 4050, 1, 1024] - - [877, 10026.6] + - [880, 10026.6] - - [1024, 3525, 1, 4096] - - [888, 8488.89] + - [891, 8488.89] - - [4096, 3303, 1, 1024] - - [875, 9790.95] + - [878, 9790.95] - - [1024, 3382, 1, 4096] - - [896, 8483.53] + - [899, 8483.53] - - [1024, 3390, 1, 4096] - - [895, 8552.71] + - [898, 8552.71] - - [1024, 3977, 1, 4096] - - [877, 9053.43] + - [880, 9053.43] - - [1024, 3184, 1, 4096] - - [895, 8008.71] + - [898, 8008.71] - - [4096, 3535, 1, 1024] - - [877, 9760.69] + - [880, 9760.69] - - [4096, 3376, 1, 1024] - - [878, 9341.83] + - [881, 9341.83] - - [4096, 3978, 1, 1024] - - [878, 9642.7] + - [881, 9642.7] - - [1024, 3136, 1, 4096] - - [897, 8085.02] + - [900, 8085.02] - - [1024, 3293, 1, 4096] - - [895, 8300.39] + - [898, 8300.39] - - [4096, 3266, 1, 1024] - - [876, 9691.68] + - [879, 9691.68] - - [1024, 3487, 1, 4096] - - [895, 8383.52] + - [898, 8383.52] - - [1024, 3409, 1, 4096] - - [897, 8493.15] + - [900, 8493.15] - - [4096, 3498, 1, 1024] - - [876, 9672.28] + - [879, 9672.28] - - [1024, 3520, 1, 4096] - - [898, 8488.16] + - [901, 8488.16] - - [1024, 3530, 1, 4096] - - [879, 8409.77] + - [882, 8409.77] - - [4096, 3393, 1, 1024] - - [877, 9395.33] + - [880, 9395.33] - - [4096, 3140, 1, 1024] - - [877, 9338.4] + - [880, 9338.4] - - [1024, 3536, 1, 4096] - - [898, 8642.01] + - [901, 8642.01] - - [1024, 3288, 1, 4096] - - [898, 8229.24] + - [901, 8229.24] - - [1024, 4005, 1, 4096] - - [880, 9270.94] + - [883, 9270.94] - - [1024, 3579, 1, 4096] - - [884, 8844.4] + - [887, 8844.4] - - [4096, 3372, 1, 1024] - - [875, 9339.15] + - [878, 9339.15] - - [1024, 3440, 1, 4096] - - [895, 8466.59] + - [898, 8466.59] - - [4096, 3213, 1, 1024] - - [878, 9558.75] + - [881, 9558.75] - - [123, 123, 528, 64] - - [910, 6333.49] + - [913, 6333.49] - - [100, 100, 624, 64] - - [909, 4584.02] + - [912, 4584.02] - - [1024, 3968, 1, 4096] - - [876, 9237.5] + - [879, 9237.5] - - [4096, 3477, 1, 1024] - - [876, 9618.78] + - [879, 9618.78] - - [4096, 3526, 1, 1024] - - [875, 9735.84] + - [878, 9735.84] - - [1024, 3493, 1, 4096] - - [896, 8355.03] + - [899, 8355.03] - - [1024, 3944, 1, 4096] - - [871, 9065.29] + - [874, 9065.29] - - [4096, 3453, 1, 1024] - - [876, 9533.27] + - [879, 9533.27] - - [1024, 3350, 1, 4096] - - [898, 8448.54] + - [901, 8448.54] - - [4096, 3184, 1, 1024] - - [877, 9447.28] + - [880, 9447.28] - - [1024, 3423, 1, 4096] - - [896, 8465.28] + - [899, 8465.28] - - [4096, 3351, 1, 1024] - - [875, 9281.96] + - [878, 9281.96] - - [4096, 3416, 1, 1024] - - [875, 9446.54] + - [878, 9446.54] - - [1024, 3796, 1, 4096] - - [877, 8820.24] + - [880, 8820.24] - - [4096, 3257, 1, 1024] - - [875, 9671.54] + - [878, 9671.54] - - [4096, 3306, 1, 1024] - - [877, 9795.41] + - [880, 9795.41] - - [33708, 4020, 1, 1024] - - [877, 9961.75] + - [880, 9961.75] - - [19, 19, 3264, 64] - - [907, 1735.99] + - [910, 1735.99] - - [1024, 3426, 1, 4096] - - [895, 8518.51] + - [898, 8518.51] - - [4096, 3457, 1, 1024] - - [875, 9564.46] + - [878, 9564.46] - - [1024, 2935, 1, 4096] - - [880, 9067.69] + - [883, 9067.69] - - [1024, 3046, 1, 4096] - - [880, 9242.87] + - [883, 9242.87] - - [4096, 3433, 1, 1024] - - [877, 9495.55] + - [880, 9495.55] - - [1024, 3256, 1, 4096] - - [898, 8224.13] + - [901, 8224.13] - - [1024, 3531, 1, 4096] - - [895, 8524.09] + - [898, 8524.09] - - [4096, 3180, 1, 1024] - - [875, 9443.43] + - [878, 9443.43] - - [1024, 3388, 1, 4096] - - [897, 8352.72] + - [900, 8352.72] - - [4096, 3444, 1, 1024] - - [878, 9510.93] + - [881, 9510.93] - - [1024, 3501, 1, 4096] - - [885, 8461.02] + - [888, 8461.02] - - [1024, 3266, 1, 4096] - - [883, 8147.34] + - [886, 8147.34] - - [1024, 3267, 1, 4096] - - [898, 8391.39] + - [901, 8391.39] - - [1024, 3461, 1, 4096] - - [882, 8270.19] + - [885, 8270.19] - - [4096, 3870, 1, 1024] - - [877, 9399.59] + - [880, 9399.59] - - [4096, 3517, 1, 1024] - - [875, 9725.33] + - [878, 9725.33] - - [1024, 3566, 1, 4096] - - [898, 8669.66] + - [901, 8669.66] - - [4096, 3574, 1, 1024] - - [875, 9844.53] + - [878, 9844.53] - - [1024, 3876, 1, 1024] - - [880, 8961.64] + - [883, 8961.64] - - [25, 25, 2512, 64] - - [906, 2472.44] + - [909, 2472.44] - - [4096, 3720, 1, 1024] - - [875, 9612.39] + - [878, 9612.39] - - [4096, 3248, 1, 1024] - - [877, 9644.82] + - [880, 9644.82] - - [4096, 4059, 1, 1024] - - [875, 9826.32] + - [878, 9826.32] - - [1024, 3380, 1, 4096] - - [896, 8677.81] + - [899, 8677.81] - - [4096, 3480, 1, 1024] - - [877, 9626.06] + - [880, 9626.06] - - [1024, 3335, 1, 4096] - - [897, 8302.08] + - [900, 8302.08] - - [1024, 3345, 1, 4096] - - [897, 8323.03] + - [900, 8323.03] - - [4096, 3391, 1, 1024] - - [875, 9379.38] + - [878, 9379.38] - - [4096, 3424, 1, 1024] - - [877, 9466.67] + - [880, 9466.67] - - [1024, 3394, 1, 4096] - - [883, 8373.81] + - [886, 8373.81] - - [4096, 3265, 1, 1024] - - [877, 9700.79] + - [880, 9700.79] - - [1024, 3014, 1, 4096] - - [880, 9302.99] + - [883, 9302.99] - - [4096, 3497, 1, 1024] - - [875, 9668.5] + - [878, 9668.5] - - [4096, 3354, 1, 1024] - - [877, 9294.21] + - [880, 9294.21] - - [4096, 3055, 1, 1024] - - [876, 9780.78] + - [879, 9780.78] - - [1024, 3499, 1, 4096] - - [889, 8526.94] + - [892, 8526.94] - - [1024, 3162, 1, 4096] - - [897, 8058.92] + - [900, 8058.92] - - [4096, 3244, 1, 1024] - - [877, 9636.76] + - [880, 9636.76] - - [1024, 3437, 1, 4096] - - [896, 8583.31] + - [899, 8583.31] - - [1024, 3356, 1, 4096] - - [898, 8296.85] + - [901, 8296.85] - - [4096, 3139, 1, 1024] - - [877, 9338.6] + - [880, 9338.6] - - [4096, 3508, 1, 1024] - - [877, 9700.44] + - [880, 9700.44] - - [1024, 3235, 1, 4096] - - [895, 8314.49] + - [898, 8314.49] - - [1024, 3910, 1, 4096] - - [882, 9200.11] + - [885, 9200.11] - - [4096, 3371, 1, 1024] - - [875, 9336.87] + - [878, 9336.87] - - [1024, 3751, 1, 4096] - - [882, 8827.57] + - [885, 8827.57] - - [4096, 3325, 1, 1024] - - [875, 9845.58] + - [878, 9845.58] - - [1024, 3413, 1, 4096] - - [883, 8345.68] + - [886, 8345.68] - - [1024, 3542, 1, 4096] - - [895, 8521.61] + - [898, 8521.61] - - [18, 18, 3440, 64] - - [911, 1578.14] + - [914, 1578.14] - - [101, 102, 624, 64] - - [909, 4705.18] + - [912, 4705.18] - - [33708, 3900, 1, 1024] - - [875, 9950.95] + - [878, 9950.95] - - [4096, 3525, 1, 1024] - - [876, 9744.37] + - [879, 9744.37] - - [4096, 3382, 1, 1024] - - [876, 9358.93] + - [879, 9358.93] - - [102, 100, 624, 64] - - [910, 4671.41] + - [913, 4671.41] - - [15, 15, 4096, 64] - - [914, 1129.07] + - [917, 1129.07] - - [1024, 3339, 1, 4096] - - [884, 8326.27] + - [887, 8326.27] - - [4096, 3288, 1, 1024] - - [877, 9761.38] + - [880, 9761.38] - - [92, 92, 688, 64] - - [917, 4903.77] + - [920, 4903.77] - - [1024, 3141, 1, 4096] - - [895, 7975.54] + - [898, 7975.54] - - [1024, 3168, 1, 4096] - - [895, 8083.64] + - [898, 8083.64] - - [4096, 3488, 1, 1024] - - [877, 9646.67] + - [880, 9646.67] - - [4096, 3046, 1, 1024] - - [876, 9767.48] + - [879, 9767.48] - - [1024, 3362, 1, 4096] - - [898, 8458.05] + - [901, 8458.05] - - [33708, 3942, 1, 1024] - - [876, 10060.3] + - [879, 10060.3] - - [4096, 3399, 1, 1024] - - [877, 9406.47] + - [880, 9406.47] - - [1024, 3720, 1, 1024] - - [879, 8639.06] + - [882, 8639.06] - - [4096, 3563, 1, 1024] - - [875, 9836.45] + - [878, 9836.45] - - [1024, 3273, 1, 4096] - - [898, 8221.52] + - [901, 8221.52] - - [4096, 3162, 1, 1024] - - [877, 9400.09] + - [880, 9400.09] - - [1024, 3467, 1, 4096] - - [896, 8342.32] + - [899, 8342.32] - - [1024, 3130, 1, 4096] - - [897, 7933.78] + - [900, 7933.78] - - [1024, 3405, 1, 4096] - - [904, 8406.49] + - [907, 8406.49] - - [4096, 3362, 1, 1024] - - [875, 9311.94] + - [878, 9311.94] - - [1024, 3960, 1, 1024] - - [879, 9082.16] + - [882, 9082.16] - - [2048, 128, 1, 4096] - - [929, 5986.52] + - [932, 5986.52] - - [1024, 3712, 1, 36548] - - [927, 9456.15] + - [930, 9456.15] - - [1024, 128, 1, 1024] - - [930, 3631.43] + - [933, 3631.43] - - [3072, 128, 1, 4096] - - [926, 6145.5] + - [929, 6145.5] - - [1024, 3712, 1, 1024] - - [928, 8933.88] + - [931, 8933.88] - - [256, 256, 192, 64] - - [933, 8264.64] + - [936, 8264.64] - - [768, 4096, 1, 768] - - [946, 9642.08] + - [949, 9642.08] - - [768, 64, 1, 768] - - [943, 1850.43] + - [946, 1850.43] - - [768, 1280, 1, 768] - - [946, 8738.13] + - [949, 8738.13] - - [30522, 320, 1, 768] - - [947, 9733.59] + - [950, 9733.59] - - [128, 128, 96, 64] - - [936, 5470.83] + - [939, 5470.83] - - [2, 16, 1, 768] - - [939, 2.47742] + - [942, 2.47742] - - [30522, 1280, 1, 768] - - [945, 10127.9] + - [948, 10127.9] - - [30522, 640, 1, 768] - - [946, 9987.61] + - [949, 9987.61] - - [2, 8, 1, 768] - - [938, 0.96] + - [941, 0.96] - - [768, 4096, 1, 3072] - - [948, 9479.41] + - [951, 9479.41] - - [768, 32, 1, 768] - - [942, 880.334] + - [945, 880.334] - - [2, 64, 1, 768] - - [939, 9.99024] + - [942, 9.99024] - - [256, 256, 96, 64] - - [933, 7614.47] + - [936, 7614.47] - - [64, 64, 768, 64] - - [935, 5354.43] + - [938, 5354.43] - - [30522, 160, 1, 768] - - [944, 7740.11] + - [947, 7740.11] - - [768, 320, 1, 768] - - [937, 5423.67] + - [940, 5423.67] - - [128, 128, 384, 64] - - [934, 7179.98] + - [937, 7179.98] - - [768, 16, 1, 768] - - [940, 706.376] + - [943, 706.376] - - [3072, 4096, 1, 768] - - [949, 9961.74] + - [952, 9961.74] - - [2048, 512, 1, 100] - - [951, 5180.71] + - [954, 5180.71] - - [1024, 200, 1, 560] - - [952, 4061.19] + - [955, 4061.19] - - [256, 1280, 1, 1024] - - [959, 4337.44] + - [962, 4337.44] - - [256, 44505, 1, 1024] - - [995, 8597.69] + - [998, 8597.69] - - [10240, 8976, 1, 256] - - [998, 9471.43] + - [1001, 9471.43] - - [256, 7168, 1, 1024] - - [989, 6718.56] + - [992, 6718.56] - - [8448, 8976, 1, 256] - - [981, 9601.31] + - [984, 9601.31] - - [18944, 8976, 1, 256] - - [990, 9666.26] + - [993, 9666.26] - - [256, 19200, 1, 1024] - - [966, 7488.94] + - [969, 7488.94] - - [5632, 8976, 1, 256] - - [978, 9358.39] + - [981, 9358.39] - - [256, 23552, 1, 1024] - - [993, 7980.89] + - [996, 7980.89] - - [256, 6656, 1, 1024] - - [993, 6287.22] + - [996, 6287.22] - - [256, 14336, 1, 1024] - - [988, 7049.26] + - [991, 7049.26] - - [256, 12544, 1, 1024] - - [966, 6728.47] + - [969, 6728.47] - - [2048, 684, 1, 768] - - [983, 8479.18] + - [986, 8479.18] - - [5376, 8976, 1, 256] - - [978, 9519.51] + - [981, 9519.51] - - [256, 5888, 1, 1024] - - [998, 6012.4] + - [1001, 6012.4] - - [19968, 8976, 1, 256] - - [990, 9684.67] + - [993, 9684.67] - - [3840, 8976, 1, 256] - - [975, 9461.89] + - [978, 9461.89] - - [4608, 8976, 1, 256] - - [975, 9305.82] + - [978, 9305.82] - - [256, 684, 1, 1024] - - [1001, 3513.06] + - [1004, 3513.06] - - [256, 22016, 1, 1024] - - [966, 7643.79] + - [969, 7643.79] - - [256, 23296, 1, 1024] - - [995, 8048.12] + - [998, 8048.12] - - [4864, 8976, 1, 256] - - [973, 9545.62] + - [976, 9545.62] - - [256, 7424, 1, 1024] - - [991, 6770.65] + - [994, 6770.65] - - [18176, 8976, 1, 256] - - [998, 9729.47] + - [1001, 9729.47] - - [256, 15104, 1, 1024] - - [987, 7289.08] + - [990, 7289.08] - - [8192, 8976, 1, 256] - - [990, 9395.49] + - [993, 9395.49] - - [256, 16128, 1, 1024] - - [990, 7461.28] + - [993, 7461.28] - - [13312, 8976, 1, 256] - - [998, 9550.97] + - [1001, 9550.97] - - [256, 21504, 1, 1024] - - [995, 7635.93] + - [998, 7635.93] - - [6400, 8976, 1, 256] - - [982, 9560.96] + - [985, 9560.96] - - [256, 8960, 1, 1024] - - [957, 6292.36] + - [960, 6292.36] - - [1792, 8976, 1, 256] - - [972, 9372.18] + - [975, 9372.18] - - [13824, 8976, 1, 256] - - [990, 9585.27] + - [993, 9585.27] - - [11776, 8976, 1, 256] - - [990, 9560.34] + - [993, 9560.34] - - [256, 20992, 1, 1024] - - [988, 7490.65] + - [991, 7490.65] - - [20480, 8976, 1, 256] - - [998, 9610.7] + - [1001, 9610.7] - - [5888, 8976, 1, 256] - - [969, 9565.2] + - [972, 9565.2] - - [256, 10496, 1, 1024] - - [960, 6631.96] + - [963, 6631.96] - - [21248, 8976, 1, 256] - - [990, 9755.77] + - [993, 9755.77] - - [5120, 8976, 1, 256] - - [998, 9244.59] + - [1001, 9244.59] - - [7168, 8976, 1, 256] - - [990, 9388.42] + - [993, 9388.42] - - [2048, 1536, 1, 768] - - [979, 9446.04] + - [982, 9446.04] - - [256, 8192, 1, 1024] - - [984, 6948.89] + - [987, 6948.89] - - [4096, 8976, 1, 256] - - [989, 9115.94] + - [992, 9115.94] - - [3328, 8976, 1, 256] - - [982, 9434.55] + - [985, 9434.55] - - [1280, 8976, 1, 256] - - [980, 9129.8] + - [983, 9129.8] - - [2560, 8976, 1, 256] - - [977, 9199.48] + - [980, 9199.48] - - [3072, 8976, 1, 256] - - [992, 8963.6] + - [995, 8963.6] - - [256, 11776, 1, 1024] - - [970, 6869.8] + - [973, 6869.8] - - [18688, 8976, 1, 256] - - [998, 9726.21] + - [1001, 9726.21] - - [15104, 8976, 1, 256] - - [998, 9715.71] + - [1001, 9715.71] - - [23552, 8976, 1, 256] - - [990, 9648.42] + - [993, 9648.42] - - [6144, 8976, 1, 256] - - [998, 9339.8] + - [1001, 9339.8] - - [12544, 8976, 1, 256] - - [998, 9654.45] + - [1001, 9654.45] - - [256, 11264, 1, 1024] - - [971, 6814.98] + - [974, 6814.98] - - [2048, 114, 1, 512] - - [1002, 4583.5] + - [1005, 4583.5] - - [4352, 8976, 1, 256] - - [982, 9471.4] + - [985, 9471.4] - - [15360, 8976, 1, 256] - - [998, 9583.77] + - [1001, 9583.77] - - [256, 31488, 1, 1024] - - [997, 8438.01] + - [1000, 8438.01] - - [28672, 8976, 1, 256] - - [990, 9688.85] + - [993, 9688.85] - - [256, 18176, 1, 1024] - - [966, 7405.09] + - [969, 7405.09] - - [9728, 8976, 1, 256] - - [998, 9524.15] + - [1001, 9524.15] - - [256, 2816, 1, 1024] - - [962, 5405.66] + - [965, 5405.66] - - [256, 18944, 1, 1024] - - [966, 7503.41] + - [969, 7503.41] - - [256, 3584, 1, 1024] - - [965, 6107.15] + - [968, 6107.15] - - [7936, 8976, 1, 256] - - [978, 9608.31] + - [981, 9608.31] - - [19712, 8976, 1, 256] - - [998, 9736.25] + - [1001, 9736.25] - - [256, 14848, 1, 1024] - - [971, 7163.42] + - [974, 7163.42] - - [256, 8448, 1, 1024] - - [971, 6372.56] + - [974, 6372.56] - - [256, 6400, 1, 1024] - - [985, 6395.71] + - [988, 6395.71] - - [256, 6144, 1, 1024] - - [996, 6490.22] + - [999, 6490.22] - - [9472, 8976, 1, 256] - - [975, 9609.92] + - [978, 9609.92] - - [256, 9984, 1, 1024] - - [958, 6484.75] + - [961, 6484.75] - - [684, 8976, 1, 256] - - [967, 8128.53] + - [970, 8128.53] - - [20992, 8976, 1, 256] - - [990, 9689.65] + - [993, 9689.65] - - [2048, 684, 1, 512] - - [974, 7241.78] + - [977, 7241.78] - - [2048, 114, 1, 768] - - [1000, 4872.46] + - [1003, 4872.46] - - [8960, 8976, 1, 256] - - [973, 9603.35] + - [976, 9603.35] - - [2048, 1536, 1, 512] - - [976, 8830.11] + - [979, 8830.11] - - [256, 3328, 1, 1024] - - [964, 5612.55] + - [967, 5612.55] - - [33536, 8976, 1, 256] - - [990, 9797.71] + - [993, 9797.71] - - [2048, 8976, 1, 256] - - [990, 8975.46] + - [993, 8975.46] - - [10496, 8976, 1, 256] - - [981, 9654.43] + - [984, 9654.43] - - [256, 5376, 1, 1024] - - [999, 5626.34] + - [1002, 5626.34] - - [256, 21248, 1, 1024] - - [968, 7525.45] + - [971, 7525.45] - - [256, 13312, 1, 1024] - - [966, 6767.11] + - [969, 6767.11] - - [16128, 8976, 1, 256] - - [990, 9715.57] + - [993, 9715.57] - - [2304, 8976, 1, 256] - - [963, 9433.83] + - [966, 9433.83] - - [256, 4864, 1, 1024] - - [953, 5743.55] + - [956, 5743.55] - - [17152, 8976, 1, 256] - - [998, 9708.94] + - [1001, 9708.94] - - [15872, 8976, 1, 256] - - [998, 9657.57] + - [1001, 9657.57] - - [9984, 8976, 1, 256] - - [975, 9639.74] + - [978, 9639.74] - - [256, 14592, 1, 1024] - - [987, 7223.92] + - [990, 7223.92] - - [256, 33536, 1, 1024] - - [994, 8147.31] + - [997, 8147.31] - - [11264, 8976, 1, 256] - - [990, 9509.96] + - [993, 9509.96] - - [31488, 8976, 1, 256] - - [998, 9799.31] + - [1001, 9799.31] - - [256, 20480, 1, 1024] - - [971, 7498.2] + - [974, 7498.2] - - [44505, 8976, 1, 256] - - [982, 9804.78] + - [985, 9804.78] - - [13568, 8976, 1, 256] - - [990, 9680.24] + - [993, 9680.24] - - [256, 11520, 1, 1024] - - [970, 6805.26] + - [973, 6805.26] - - [256, 7936, 1, 1024] - - [986, 6971.77] + - [989, 6971.77] - - [2048, 256, 1, 768] - - [956, 7129.13] + - [959, 7129.13] - - [256, 4608, 1, 1024] - - [954, 5462.91] + - [957, 5462.91] - - [256, 2304, 1, 1024] - - [961, 4842.69] + - [964, 4842.69] - - [256, 2560, 1, 1024] - - [962, 5309.25] + - [965, 5309.25] - - [2816, 8976, 1, 256] - - [973, 9409.56] + - [976, 9409.56] - - [1728, 320, 1, 64] - - [1009, 3205.57] + - [1012, 3205.57] - - [1152, 128, 1, 784] - - [1056, 3498.96] + - [1059, 3498.96] - - [576, 96, 1, 5329] - - [1042, 3947.92] + - [1045, 3947.92] - - [864, 96, 1, 1225] - - [1063, 3009.67] + - [1066, 3009.67] - - [256, 128, 1, 784] - - [1053, 1536.49] + - [1056, 1536.49] - - [1440, 320, 1, 196] - - [1006, 4824.62] + - [1009, 4824.62] - - [192, 48, 1, 1225] - - [1084, 820.465] + - [1087, 820.465] - - [2592, 384, 1, 289] - - [1024, 7353.01] + - [1027, 7353.01] - - [192, 80, 36, 10368] - - [1074, 5360.04] + - [1077, 5360.04] - - [896, 192, 1, 289] - - [1041, 3076.56] + - [1044, 3076.56] - - [768, 128, 1, 289] - - [1066, 2351.81] + - [1069, 2351.81] - - [64, 256, 1, 3136] - - [1092, 1809.16] + - [1095, 1809.16] - - [1280, 384, 1, 64] - - [1006, 3171.1] + - [1009, 3171.1] - - [512, 144, 1, 196] - - [1064, 1445.07] + - [1067, 1445.07] - - [1344, 192, 1, 289] - - [1047, 4376.52] + - [1050, 4376.52] - - [288, 64, 1, 21609] - - [1058, 3396.12] + - [1061, 3396.12] - - [400, 32, 1, 784] - - [1085, 922.353] + - [1088, 922.353] - - [288, 32, 1, 21609] - - [1096, 2816.01] + - [1099, 2816.01] - - [1280, 448, 1, 64] - - [1009, 3253.56] + - [1012, 3253.56] - - [3456, 256, 1, 169] - - [1021, 5822.44] + - [1024, 5822.44] - - [2304, 256, 1, 196] - - [1019, 4931.98] + - [1022, 4931.98] - - [384, 192, 1, 1225] - - [1067, 2720.39] + - [1070, 2720.39] - - [832, 48, 1, 49] - - [1062, 344.518] + - [1065, 344.518] - - [832, 192, 1, 49] - - [1044, 1099.36] + - [1047, 1099.36] - - [1280, 192, 1, 64] - - [1045, 2069.56] + - [1048, 2069.56] - - [192, 32, 1, 784] - - [1084, 459.627] + - [1087, 459.627] - - [288, 48, 1, 1225] - - [1091, 1176.0] + - [1094, 1176.0] - - [512, 112, 1, 196] - - [1059, 1277.21] + - [1062, 1277.21] - - [224, 192, 36, 2592] - - [1076, 7369.56] + - [1079, 7369.56] - - [528, 32, 1, 196] - - [1050, 440.374] + - [1053, 440.374] - - [192, 128, 36, 1568] - - [1075, 8245.76] + - [1078, 8245.76] - - [4032, 384, 1, 64] - - [1020, 5898.24] + - [1023, 5898.24] - - [576, 64, 1, 3136] - - [1065, 2671.11] + - [1068, 2671.11] - - [2048, 32, 1, 1001] - - [1067, 2323.0] + - [1070, 2323.0] - - [480, 64, 1, 196] - - [1052, 752.64] + - [1055, 752.64] - - [512, 256, 1, 196] - - [1054, 2528.55] + - [1057, 2528.55] - - [864, 96, 1, 289] - - [1064, 1958.4] + - [1067, 1958.4] - - [896, 128, 1, 289] - - [1067, 2725.73] + - [1070, 2725.73] - - [192, 64, 1, 784] - - [1082, 898.675] + - [1085, 898.675] - - [1200, 64, 1, 1225] - - [1066, 2780.14] + - [1069, 2780.14] - - [1296, 288, 1, 196] - - [1005, 3826.18] + - [1008, 3826.18] - - [576, 96, 1, 5041] - - [1046, 3795.58] + - [1049, 3795.58] - - [1024, 256, 1, 289] - - [1035, 4488.13] + - [1038, 4488.13] - - [1024, 2048, 1, 49] - - [1025, 5077.1] + - [1028, 5077.1] - - [192, 64, 36, 6272] - - [1069, 7514.98] + - [1072, 7514.98] - - [4096, 512, 1, 4096] - - [1031, 10276.0] + - [1034, 10276.0] - - [192, 32, 1, 1225] - - [1085, 556.686] + - [1088, 556.686] - - [1024, 256, 1, 196] - - [1045, 3892.44] + - [1048, 3892.44] - - [1120, 192, 1, 289] - - [1034, 3752.81] + - [1037, 3752.81] - - [400, 48, 1, 196] - - [1059, 480.0] + - [1062, 480.0] - - [1728, 224, 1, 1225] - - [1012, 5575.77] + - [1015, 5575.77] - - [800, 96, 1, 784] - - [1066, 2668.94] + - [1069, 2668.94] - - [1152, 384, 1, 64] - - [1016, 3077.34] + - [1019, 3077.34] - - [4608, 512, 1, 49] - - [1023, 4676.6] + - [1026, 4676.6] - - [1792, 256, 1, 289] - - [1016, 5345.94] + - [1019, 5345.94] - - [864, 128, 1, 784] - - [1066, 3816.2] + - [1069, 3816.2] - - [1728, 384, 1, 169] - - [1018, 5191.68] + - [1021, 5191.68] - - [480, 16, 1, 196] - - [1087, 241.231] + - [1090, 241.231] - - [1568, 256, 1, 289] - - [1006, 4723.41] + - [1009, 4723.41] - - [1152, 448, 1, 64] - - [1012, 3356.72] + - [1015, 3356.72] - - [512, 64, 1, 196] - - [1051, 802.816] + - [1054, 802.816] - - [1344, 224, 1, 289] - - [1006, 3519.63] + - [1009, 3519.63] - - [9216, 512, 1, 4096] - - [1029, 9146.02] + - [1032, 9146.02] - - [27, 32, 1, 22201] - - [1097, 264.356] + - [1100, 264.356] - - [1152, 192, 1, 784] - - [1036, 4904.08] + - [1039, 4904.08] - - [1536, 256, 1, 64] - - [1004, 2578.47] + - [1007, 2578.47] - - [800, 128, 1, 196] - - [1066, 1991.11] + - [1069, 1991.11] - - [800, 64, 1, 196] - - [1061, 1150.83] + - [1064, 1150.83] - - [864, 208, 1, 196] - - [1038, 2684.72] + - [1041, 2684.72] - - [1440, 320, 1, 49] - - [1007, 2313.44] + - [1010, 2313.44] - - [512, 128, 1, 784] - - [1057, 2780.32] + - [1060, 2780.32] - - [720, 192, 1, 5041] - - [1032, 5410.46] + - [1035, 5410.46] - - [256, 64, 1, 784] - - [1089, 1163.5] + - [1092, 1163.5] - - [256, 48, 1, 1225] - - [1084, 1075.2] + - [1087, 1075.2] - - [576, 192, 1, 3136] - - [1032, 4833.01] + - [1035, 4833.01] - - [160, 64, 1, 5329] - - [1086, 1753.5] + - [1089, 1753.5] - - [3456, 384, 1, 289] - - [1026, 7341.75] + - [1029, 7341.75] - - [32, 32, 36, 43808] - - [1080, 1378.03] + - [1083, 1378.03] - - [1344, 512, 1, 64] - - [1005, 3822.93] + - [1008, 3822.93] - - [192, 16, 1, 784] - - [1085, 228.073] + - [1088, 228.073] - - [3456, 384, 1, 169] - - [1022, 6675.02] + - [1025, 6675.02] - - [1152, 256, 1, 196] - - [1015, 3211.26] + - [1018, 3211.26] - - [1728, 192, 1, 1225] - - [1016, 4852.26] + - [1019, 4852.26] - - [2048, 512, 1, 49] - - [1028, 3471.64] + - [1031, 3471.64] - - [576, 96, 1, 1225] - - [1059, 2176.66] + - [1062, 2176.66] - - [512, 2048, 1, 49] - - [1010, 3845.83] + - [1013, 3845.83] - - [1728, 192, 1, 64] - - [1005, 2369.83] + - [1008, 2369.83] - - [832, 256, 1, 49] - - [1035, 1433.6] + - [1038, 1433.6] - - [512, 128, 1, 196] - - [1060, 1459.67] + - [1063, 1459.67] - - [1200, 128, 1, 49] - - [1055, 1069.09] + - [1058, 1069.09] - - [528, 256, 1, 196] - - [1043, 2069.76] + - [1046, 2069.76] - - [256, 512, 1, 784] - - [1066, 4538.89] + - [1069, 4538.89] - - [480, 192, 1, 196] - - [1066, 1792.0] + - [1069, 1792.0] - - [96, 64, 36, 2592] - - [1073, 4845.41] + - [1076, 4845.41] - - [96, 96, 36, 2592] - - [1078, 5111.53] + - [1081, 5111.53] - - [1024, 192, 1, 289] - - [1040, 3431.14] + - [1043, 3431.14] - - [1536, 384, 1, 64] - - [1011, 3166.84] + - [1014, 3166.84] - - [192, 96, 1, 784] - - [1051, 881.14] + - [1054, 881.14] - - [2048, 192, 1, 64] - - [1008, 2330.17] + - [1011, 2330.17] - - [192, 64, 1, 1225] - - [1090, 1100.35] + - [1093, 1100.35] - - [512, 32, 1, 196] - - [1081, 477.867] + - [1084, 477.867] - - [128, 96, 36, 1568] - - [1077, 6649.09] + - [1080, 6649.09] - - [528, 128, 1, 196] - - [1063, 1403.23] + - [1066, 1403.23] - - [128, 512, 1, 784] - - [1053, 2237.81] + - [1056, 2237.81] - - [128, 128, 36, 3136] - - [1070, 6538.77] + - [1073, 6538.77] - - [528, 160, 1, 196] - - [1067, 1642.67] + - [1070, 1642.67] - - [448, 64, 1, 5329] - - [1042, 3264.81] + - [1045, 3264.81] - - [1280, 320, 1, 64] - - [1006, 2776.95] + - [1009, 2776.95] - - [1792, 320, 1, 289] - - [1018, 5204.9] + - [1021, 5204.9] - - [2880, 320, 1, 64] - - [1014, 4336.94] + - [1017, 4336.94] - - [147, 64, 1, 12544] - - [1095, 2430.27] + - [1098, 2430.27] - - [4096, 512, 1, 1001] - - [1030, 9618.99] + - [1033, 9618.99] - - [1536, 32, 1, 1001] - - [1067, 1757.18] + - [1070, 1757.18] - - [512, 160, 1, 196] - - [1063, 1592.89] + - [1066, 1592.89] - - [768, 160, 1, 289] - - [1064, 2757.17] + - [1067, 2757.17] - - [1728, 384, 1, 49] - - [1016, 3102.49] + - [1019, 3102.49] - - [64, 32, 36, 43808] - - [1071, 2626.43] + - [1074, 2626.43] - - [64, 64, 1, 3136] - - [1083, 610.506] + - [1086, 610.506] - - [256, 32, 1, 784] - - [1084, 612.837] + - [1087, 612.837] - - [480, 96, 1, 196] - - [1059, 1055.1] + - [1062, 1055.1] - - [1024, 32, 1, 1001] - - [1049, 1188.43] + - [1052, 1188.43] - - [832, 160, 1, 49] - - [1064, 959.247] + - [1067, 959.247] - - [512, 1024, 1, 196] - - [1007, 4978.7] + - [1010, 4978.7] - - [2048, 64, 1, 1001] - - [1099, 4385.13] + - [1102, 4385.13] - - [2048, 128, 1, 1001] - - [1098, 5764.63] + - [1101, 5764.63] - - [1536, 64, 1, 1001] - - [1100, 3162.03] + - [1103, 3162.03] - - [32, 32, 64, 40000] - - [1134, 2449.4] + - [1137, 2449.4] - - [224, 192, 36, 5184] - - [1129, 7500.12] + - [1132, 7500.12] - - [32, 32, 49, 115200] - - [1135, 1878.28] + - [1138, 1878.28] - - [384, 448, 49, 512] - - [1125, 8945.32] + - [1128, 8945.32] - - [192, 80, 36, 20736] - - [1123, 5412.26] + - [1126, 5412.26] - - [384, 448, 64, 256] - - [1126, 9230.33] + - [1129, 9230.33] - - [96, 64, 64, 18432] - - [1110, 5008.4] + - [1113, 5008.4] - - [224, 192, 64, 4608] - - [1129, 8684.53] + - [1132, 8684.53] - - [96, 96, 49, 3136] - - [1133, 5183.63] + - [1136, 5183.63] - - [224, 192, 64, 2304] - - [1125, 8722.76] + - [1128, 8722.76] - - [64, 32, 49, 57600] - - [1115, 3565.26] + - [1118, 3565.26] - - [384, 448, 36, 256] - - [1124, 8843.41] + - [1127, 8843.41] - - [96, 64, 36, 10368] - - [1117, 4997.46] + - [1120, 4997.46] - - [96, 64, 36, 20736] - - [1119, 5034.77] + - [1122, 5034.77] - - [192, 80, 49, 14400] - - [1115, 4892.22] + - [1118, 4892.22] - - [96, 64, 49, 6272] - - [1136, 5617.04] + - [1139, 5617.04] - - [64, 32, 49, 115200] - - [1114, 3572.57] + - [1117, 3572.57] - - [384, 448, 49, 256] - - [1127, 8858.66] + - [1130, 8858.66] - - [96, 96, 64, 2304] - - [1123, 5379.02] + - [1126, 5379.02] - - [96, 96, 49, 6272] - - [1132, 5235.76] + - [1135, 5235.76] - - [224, 192, 49, 6272] - - [1128, 7629.28] + - [1131, 7629.28] - - [96, 96, 36, 10368] - - [1131, 5281.04] + - [1134, 5281.04] - - [96, 64, 36, 5184] - - [1116, 4945.73] + - [1119, 4945.73] - - [384, 448, 64, 512] - - [1124, 9294.86] + - [1127, 9294.86] - - [224, 192, 49, 3136] - - [1128, 7513.4] + - [1131, 7513.4] - - [384, 448, 36, 512] - - [1130, 8961.38] + - [1133, 8961.38] - - [32, 32, 36, 175232] - - [1138, 1385.5] + - [1141, 1385.5] - - [224, 192, 36, 10368] - - [1129, 7565.73] + - [1132, 7565.73] - - [64, 32, 64, 40000] - - [1114, 4658.85] + - [1117, 4658.85] - - [96, 64, 64, 4608] - - [1113, 5461.6] + - [1116, 5461.6] - - [32, 32, 49, 57600] - - [1135, 1877.01] + - [1138, 1877.01] - - [192, 80, 36, 41472] - - [1121, 5123.59] + - [1124, 5123.59] - - [32, 32, 36, 87616] - - [1134, 1382.32] + - [1137, 1382.32] - - [192, 80, 49, 28800] - - [1114, 4901.95] + - [1117, 4901.95] - - [96, 64, 49, 28800] - - [1111, 4862.5] + - [1114, 4862.5] - - [96, 64, 36, 41472] - - [1118, 5002.26] + - [1121, 5002.26] - - [192, 80, 64, 9216] - - [1109, 5300.55] + - [1112, 5300.55] - - [96, 96, 36, 5184] - - [1131, 5246.24] + - [1134, 5246.24] - - [32, 32, 64, 80000] - - [1139, 2457.11] + - [1142, 2457.11] - - [96, 64, 64, 2304] - - [1137, 6225.74] + - [1140, 6225.74] - - [96, 64, 49, 3136] - - [1136, 5489.02] + - [1139, 5489.02] - - [64, 32, 36, 87616] - - [1114, 2636.29] + - [1117, 2636.29] - - [64, 32, 64, 80000] - - [1114, 4677.64] + - [1117, 4677.64] - - [96, 96, 64, 4608] - - [1120, 5119.63] + - [1123, 5119.63] - - [64, 32, 36, 175232] - - [1115, 2639.83] + - [1118, 2639.83] - - [64, 64, 11, 233600] - - [1170, 1694.16] + - [1173, 1694.16] - - [320, 256, 9, 19584] - - [1146, 7802.45] + - [1149, 7802.45] - - [256, 224, 9, 9792] - - [1161, 7100.97] + - [1164, 7100.97] + - - [128, 128, 11, 3264] + - [1170, 4828.06] - - [256, 256, 9, 4896] - - [1159, 6163.1] + - [1162, 6163.1] - - [320, 256, 9, 4896] - - [1145, 7515.25] + - [1148, 7515.25] - - [224, 192, 9, 19584] - - [1153, 5761.25] + - [1156, 5761.25] - - [192, 192, 11, 3264] - - [1142, 6814.07] + - [1145, 6814.07] - - [64, 64, 11, 116800] - - [1179, 1692.18] + - [1182, 1692.18] - - [64, 64, 9, 172864] - - [1171, 1385.54] + - [1174, 1385.54] - - [192, 128, 11, 6528] - - [1163, 5057.19] + - [1166, 5057.19] - - [64, 64, 11, 58400] - - [1179, 1688.53] + - [1182, 1688.53] - - [192, 160, 9, 19584] - - [1155, 4940.68] + - [1158, 4940.68] - - [128, 128, 9, 9792] - - [1177, 4094.51] + - [1180, 4094.51] + - - [128, 128, 11, 6528] + - [1180, 4780.97] - - [192, 192, 11, 6528] - - [1142, 6918.07] + - [1145, 6918.07] - - [160, 160, 9, 4896] - - [1168, 4545.61] + - [1171, 4545.61] - - [192, 192, 9, 4896] - - [1158, 6156.67] + - [1161, 6156.67] - - [256, 256, 11, 13056] - - [1148, 7526.25] + - [1151, 7526.25] - - [224, 192, 11, 6528] - - [1172, 7333.58] + - [1175, 7333.58] + - - [192, 192, 9, 19584] + - [1176, 5859.95] - - [256, 224, 11, 13056] - - [1146, 6512.15] + - [1149, 6512.15] - - [224, 192, 11, 13056] - - [1175, 6429.18] + - [1178, 6429.18] - - [256, 256, 11, 3264] - - [1143, 7366.03] + - [1146, 7366.03] - - [192, 160, 11, 13056] - - [1155, 5994.41] + - [1158, 5994.41] - - [320, 256, 11, 6528] - - [1152, 8725.7] + - [1155, 8725.7] - - [192, 192, 9, 9792] - - [1155, 5843.92] + - [1158, 5843.92] - - [192, 160, 11, 6528] - - [1164, 6308.46] + - [1167, 6308.46] - - [224, 224, 9, 9792] - - [1165, 6268.31] + - [1168, 6268.31] - - [64, 64, 9, 86432] - - [1170, 1382.91] + - [1173, 1382.91] - - [224, 192, 11, 3264] - - [1173, 7336.37] + - [1176, 7336.37] + - - [128, 128, 9, 19584] + - [1143, 3631.15] - - [224, 224, 11, 6528] - - [1162, 5718.39] + - [1165, 5718.39] - - [160, 160, 11, 13056] - - [1174, 5005.14] + - [1177, 5005.14] - - [160, 160, 9, 19584] - - [1169, 4564.74] + - [1172, 4564.74] - - [192, 128, 9, 19584] - - [1140, 5444.53] + - [1143, 5444.53] - - [192, 160, 9, 9792] - - [1157, 5209.54] + - [1160, 5209.54] - - [224, 224, 9, 19584] - - [1165, 5549.59] + - [1168, 5549.59] - - [192, 192, 11, 13056] - - [1147, 7053.75] + - [1150, 7053.75] - - [192, 128, 9, 4896] - - [1147, 5314.67] + - [1150, 5314.67] - - [320, 256, 9, 9792] - - [1141, 7770.2] + - [1144, 7770.2] - - [320, 256, 11, 13056] - - [1151, 8806.16] + - [1154, 8806.16] + - - [64, 64, 9, 345728] + - [1182, 1386.57] - - [128, 128, 9, 4896] - - [1177, 4041.34] + - [1180, 4041.34] - - [256, 256, 9, 9792] - - [1162, 6138.47] + - [1165, 6138.47] - - [224, 224, 9, 4896] - - [1154, 6936.98] + - [1157, 6936.98] - - [320, 256, 11, 3264] - - [1150, 8630.45] + - [1153, 8630.45] + - - [256, 256, 11, 6528] + - [1145, 7354.98] - - [224, 192, 9, 4896] - - [1174, 6747.03] + - [1177, 6747.03] - - [256, 224, 9, 19584] - - [1149, 5923.69] + - [1152, 5923.69] - - [192, 128, 11, 3264] - - [1160, 4952.72] + - [1163, 4952.72] - - [224, 224, 11, 13056] - - [1162, 5747.58] + - [1165, 5747.58] - - [224, 224, 11, 3264] - - [1165, 5738.78] + - [1168, 5738.78] - - [160, 160, 11, 3264] - - [1163, 5133.73] + - [1166, 5133.73] - - [256, 224, 11, 6528] - - [1156, 6509.68] + - [1159, 6509.68] + - - [128, 128, 11, 13056] + - [1150, 4411.67] - - [192, 160, 9, 4896] - - [1176, 5118.14] + - [1179, 5118.14] - - [256, 224, 11, 3264] - - [1166, 6508.85] + - [1169, 6508.85] - - [160, 160, 9, 9792] - - [1178, 4552.22] + - [1181, 4552.22] - - [192, 160, 11, 3264] - - [1157, 6185.35] + - [1160, 6185.35] - - [256, 256, 9, 19584] - - [1146, 6147.61] + - [1149, 6147.61] + - - [192, 128, 11, 13056] + - [1160, 5112.27] - - [224, 192, 9, 9792] - - [1144, 6657.91] + - [1147, 6657.91] - - [160, 160, 11, 6528] - - [1163, 5254.64] + - [1166, 5254.64] - - [256, 224, 9, 4896] - - [1156, 7023.59] + - [1159, 7023.59] - - [192, 128, 9, 9792] - - [1147, 5400.54] + - [1150, 5400.54] + - - [1024, 6400, 1, 65] + - [1183, 5298.31] + - - [4096, 6400, 1, 256] + - [1184, 9150.88] + - - [4096, 64, 1, 1024] + - [1185, 5482.75] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml index e04f05ccb..5c1b95cb4 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bjlk_SB.yaml @@ -67363,6 +67363,867 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 413 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 414 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 415 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_NLCA1_NLCB1_PGR0_PLR1_TT8_4_USFGRO0_VW4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 416 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 832 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 64 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 417 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -67383,7 +68244,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -67396,21 +68257,170 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 418 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 8 + LSCB: 16 LSPA: 4 LSPB: 8 LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 832 + LdsNumElements: 896 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 64 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -67429,9 +68439,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -67439,8 +68449,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -67491,12 +68501,310 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 413 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SolutionIndex: 419 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 + LdcEqualsLdd: false + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 420 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 896 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 421 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -67509,7 +68817,156 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 1] + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 2 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Source + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 422 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -67640,7 +69097,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 414 + SolutionIndex: 423 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67659,7 +69116,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -67789,7 +69246,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 415 + SolutionIndex: 424 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67808,7 +69265,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -67938,7 +69395,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 416 + SolutionIndex: 425 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -67957,7 +69414,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68087,7 +69544,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 417 + SolutionIndex: 426 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -68106,7 +69563,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68236,7 +69693,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 418 + SolutionIndex: 427 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -68255,7 +69712,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68268,7 +69725,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68277,7 +69734,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -68290,25 +69747,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68321,11 +69778,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68335,13 +69792,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68385,12 +69842,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 419 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 428 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -68403,8 +69860,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68417,7 +69874,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68442,18 +69899,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 896 + LdsNumElements: 1024 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 512 LdsOffsetB: 256 @@ -68470,10 +69927,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -68482,14 +69939,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -68534,17 +69991,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 420 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 + SolutionIndex: 429 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -68553,7 +70010,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68566,7 +70023,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68574,39 +70031,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 4 LSPB: 8 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68619,11 +70076,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68631,14 +70088,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -68683,26 +70140,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 421 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 430 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68715,7 +70172,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68723,39 +70180,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 896 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68768,11 +70225,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -68780,15 +70237,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -68832,26 +70289,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 422 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 431 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -68864,7 +70321,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -68873,7 +70330,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -68886,25 +70343,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 16 + LVCB: 8 LVPA: 2 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -68917,7 +70374,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -68931,11 +70388,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -68981,8 +70438,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 423 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 + SolutionIndex: 432 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 SubGroupA: 16 @@ -69000,7 +70457,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69021,35 +70478,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 32 + LSPA: 8 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 8 LVPA: 2 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -69068,9 +70525,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69078,13 +70535,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -69130,25 +70587,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 424 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 433 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [32, 4, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -69171,7 +70628,7 @@ ExpandPointerSwap: false FractionalLoad: false GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -69184,25 +70641,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 16 + LVCA: 32 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3200 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69216,10 +70673,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69229,13 +70686,13 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69279,12 +70736,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 425 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 434 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -69297,8 +70754,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69337,17 +70794,17 @@ InnerUnroll: 1 KernelLanguage: Source LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 1664 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -69366,9 +70823,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69376,14 +70833,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -69428,17 +70885,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 426 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 435 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -69446,8 +70903,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69468,39 +70925,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 32 - LSCB: 32 + LSCA: 16 + LSCB: 16 LSPA: 8 LSPB: 8 LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 2 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69514,10 +70971,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69525,8 +70982,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -69577,26 +71034,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 427 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 436 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -69609,7 +71066,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 24 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -69634,22 +71091,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 32 + LSPA: 8 + LSPB: 8 + LVCA: 8 LVCB: 8 - LVPA: 2 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 384 + LdsNumElementsAlignedB: 384 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 384 + LdsOffsetB_Blk: 1408 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -69662,10 +71119,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 24 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -69674,15 +71131,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 3 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 3 + NumLoadsPerpendicularB: 3 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -69726,17 +71183,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 428 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 437 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 0 @@ -69744,78 +71201,78 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 32 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69823,13 +71280,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -69875,46 +71332,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 429 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 - SubGroup0: 16 + SolutionIndex: 438 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -69924,47 +71381,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 2 - LVPB: 16 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3200 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 8 - MacroTileA: 64 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -69974,11 +71431,11 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70024,48 +71481,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 430 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 + SolutionIndex: 439 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70073,47 +71530,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 8 - LSPA: 4 - LSPB: 16 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 - LVCB: 4 - LVPA: 2 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1664 - LdsNumElementsAlignedA: 512 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70121,15 +71578,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70173,12 +71630,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 431 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 440 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -70186,35 +71643,35 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70222,43 +71679,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 8 LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -70270,15 +71727,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70322,8 +71779,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 432 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 441 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -70335,33 +71792,33 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 24 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -70371,43 +71828,43 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 384 - LdsNumElementsAlignedB: 384 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 384 - LdsOffsetB_Blk: 1408 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 24 + LoopUnroll: 8 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -70419,15 +71876,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 3 - NumLoadsB: 3 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 3 - NumLoadsPerpendicularB: 3 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -70471,8 +71928,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 433 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x24_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 442 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -70484,13 +71941,13 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -70503,7 +71960,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70511,8 +71968,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -70524,26 +71981,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70556,10 +72013,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -70568,13 +72025,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70620,20 +72077,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 434 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 443 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -70652,7 +72109,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70660,39 +72117,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70705,11 +72162,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70717,7 +72174,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 @@ -70769,24 +72226,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 435 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x08_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 444 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -70801,7 +72258,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70809,39 +72266,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -70854,11 +72311,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -70866,13 +72323,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 PerformanceSyncLocation: -1 @@ -70918,24 +72375,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 436 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 445 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -70950,7 +72407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -70967,36 +72424,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -71015,7 +72472,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -71023,7 +72480,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71067,8 +72524,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 437 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 446 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71085,7 +72542,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71108,7 +72565,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71121,41 +72578,41 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -71172,7 +72629,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71216,17 +72673,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 438 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 447 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -71234,7 +72691,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71257,7 +72714,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71270,17 +72727,17 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1792 LdsNumElementsAlignedA: 512 @@ -71313,15 +72770,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71365,17 +72822,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 439 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 448 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -71383,7 +72840,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71405,39 +72862,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -71451,10 +72908,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71462,15 +72919,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71514,25 +72971,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 440 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 449 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -71555,7 +73012,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71568,21 +73025,21 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 4 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -71592,18 +73049,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -71611,15 +73068,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71663,17 +73120,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 441 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 450 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -71681,8 +73138,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -71703,8 +73160,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71715,19 +73172,19 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 1024 LdsNumElementsAlignedA: 256 @@ -71741,14 +73198,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 16 MacroTile1: 16 MacroTileA: 16 @@ -71760,7 +73217,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -71768,7 +73225,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71812,8 +73269,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 442 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 451 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71825,13 +73282,13 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -71853,7 +73310,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -71866,19 +73323,19 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 @@ -71890,14 +73347,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -71909,15 +73366,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -71961,8 +73418,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 443 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 452 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -71979,8 +73436,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72001,35 +73458,35 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -72048,9 +73505,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -72058,15 +73515,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72110,26 +73567,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 444 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 453 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72150,7 +73607,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -72162,43 +73619,43 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72207,13 +73664,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -72259,26 +73716,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 445 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 454 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72299,7 +73756,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -72307,31 +73764,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72345,9 +73802,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72356,7 +73813,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -72408,25 +73865,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 446 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x016x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 455 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -72449,7 +73906,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72462,25 +73919,25 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 16 - LVCA: 8 - LVCB: 8 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -72494,9 +73951,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72513,7 +73970,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72557,11 +74014,11 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 447 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 456 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false ThreadTile: [2, 2] @@ -72575,7 +74032,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -72597,8 +74054,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -72606,46 +74063,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -72654,15 +74111,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -72706,26 +74163,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 448 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 457 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -72755,7 +74212,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -72855,8 +74312,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 449 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 + SolutionIndex: 458 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -72904,7 +74361,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -73004,8 +74461,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 450 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 459 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -73044,7 +74501,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -73053,30 +74510,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73090,9 +74547,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73101,7 +74558,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -73153,26 +74610,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 451 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 460 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73185,7 +74642,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73205,33 +74662,33 @@ GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 8 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73240,9 +74697,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73250,13 +74707,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -73302,12 +74759,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 452 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 + SolutionIndex: 461 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -73320,8 +74777,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73334,7 +74791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73342,8 +74799,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -73351,46 +74808,46 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 16 LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73399,15 +74856,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73451,25 +74908,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 453 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 462 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -73483,7 +74940,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73491,31 +74948,31 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalReadVectorWidth: 2 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -73529,7 +74986,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -73537,10 +74994,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73548,15 +75005,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -73600,26 +75057,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 454 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 + SolutionIndex: 463 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73632,7 +75089,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73640,8 +75097,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -73649,30 +75106,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73685,10 +75142,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -73697,13 +75154,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -73749,26 +75206,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 455 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 464 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -73781,7 +75238,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -73789,39 +75246,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73834,11 +75291,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73846,7 +75303,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -73898,24 +75355,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 456 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 465 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -73938,39 +75395,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -73984,10 +75441,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -73995,8 +75452,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -74047,25 +75504,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 457 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 466 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -74087,39 +75544,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74133,10 +75590,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74144,15 +75601,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74196,25 +75653,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 458 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 - SubGroup0: 8 + SolutionIndex: 467 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -74236,39 +75693,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 + LSCA: 128 + LSCB: 32 + LSPA: 8 LSPB: 32 - LVCA: 8 + LVCA: 32 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74282,10 +75739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74293,13 +75750,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -74345,26 +75802,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 459 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 468 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74404,12 +75861,12 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 16 - LSPA: 16 - LSPB: 32 + LSPA: 8 + LSPB: 16 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 @@ -74423,14 +75880,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 16 MacroTileA: 32 @@ -74442,15 +75899,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74494,8 +75951,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 460 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 469 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -74512,8 +75969,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74553,12 +76010,12 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 + LSPA: 16 + LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -74572,14 +76029,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -74591,15 +76048,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -74643,8 +76100,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 461 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SolutionIndex: 470 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -74661,8 +76118,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74683,39 +76140,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCA: 16 + LSCB: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -74729,10 +76186,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74740,13 +76197,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -74792,26 +76249,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 462 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SolutionIndex: 471 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74832,56 +76289,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -74889,8 +76346,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -74941,26 +76398,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 463 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 - SubGroup0: 16 + SolutionIndex: 472 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -74998,22 +76455,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 32 - LVCA: 32 + LVCA: 8 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75027,9 +76484,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75038,13 +76495,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -75090,25 +76547,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 464 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_04_04 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SolutionIndex: 473 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -75130,56 +76587,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75187,15 +76644,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75239,25 +76696,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 465 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 + SolutionIndex: 474 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] + VectorWidth: 4 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -75279,56 +76736,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 LVPA: 4 - LVPB: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 8 + MacroTileA: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -75336,15 +76793,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -75388,26 +76845,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 466 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 475 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75437,7 +76894,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -75537,8 +76994,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 467 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 476 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75556,7 +77013,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75586,7 +77043,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -75686,8 +77143,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 468 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 + SolutionIndex: 477 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75705,7 +77162,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75735,7 +77192,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -75835,8 +77292,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 469 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 + SolutionIndex: 478 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -75854,7 +77311,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -75884,7 +77341,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 @@ -75892,22 +77349,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -75921,9 +77378,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -75932,13 +77389,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -75984,17 +77441,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 470 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 + SolutionIndex: 479 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -76033,30 +77490,30 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 - LSPA: 8 + LSPA: 16 LSPB: 32 - LVCA: 32 + LVCA: 16 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -76070,9 +77527,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 8 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -76081,13 +77538,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -76133,17 +77590,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 471 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x008x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG16_04_04 + SolutionIndex: 480 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 SubGroupA: 16 SubGroupB: 4 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -76182,7 +77639,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -76282,8 +77739,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 472 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 + SolutionIndex: 481 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -76306,51 +77763,51 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + KernelLanguage: Source + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -76360,7 +77817,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -76368,10 +77825,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76379,13 +77836,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -76431,46 +77888,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 473 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x016x32_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 482 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76480,22 +77937,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -76509,18 +77966,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76528,8 +77985,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -76580,12 +78037,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 474 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 483 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -76593,33 +78050,33 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + ExpandPointerSwap: false + FractionalLoad: false GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -76629,47 +78086,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 4 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76677,14 +78134,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -76729,96 +78186,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 475 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 484 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 32 + KernelLanguage: Source + LSCA: 128 + LSCB: 64 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76826,8 +78283,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -76878,96 +78335,96 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 476 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x008x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_04_04 + SolutionIndex: 485 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 - LVPB: 16 + KernelLanguage: Source + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -76975,8 +78432,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -77027,26 +78484,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 477 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x32_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 486 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -77059,7 +78516,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -77084,22 +78541,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77112,10 +78569,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77124,14 +78581,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -77176,17 +78633,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 478 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL0_TT08_08_VW04_WG16_16_01 + SolutionIndex: 487 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 0 @@ -77195,7 +78652,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -77233,22 +78690,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Source - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77262,9 +78719,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77273,13 +78730,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -77325,17 +78782,17 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 479 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionIndex: 488 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 0 @@ -77344,27 +78801,27 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77381,23 +78838,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 64 + KernelLanguage: Assembly + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77410,10 +78867,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -77422,14 +78879,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -77474,20 +78931,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 480 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionIndex: 489 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + SuppresssNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -77498,22 +78955,22 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77530,23 +78987,23 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 128 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -77559,11 +79016,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -77571,13 +79028,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -77623,35 +79080,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 481 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SolutionIndex: 490 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 4] + SuppresssNoLoadLoop: true + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77661,8 +79118,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77679,7 +79136,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 64 LSPA: 16 @@ -77772,35 +79229,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 482 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 + SolutionIndex: 491 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppresssNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77810,8 +79267,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -77828,7 +79285,7 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source + KernelLanguage: Assembly LSCA: 64 LSCB: 128 LSPA: 16 @@ -77921,35 +79378,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 483 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 + SolutionIndex: 492 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false + SuppresssNoLoadLoop: true ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false @@ -77959,41 +79416,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 - KernelLanguage: Source - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 16 + LVCB: 64 + LVPA: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 7680 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78007,10 +79464,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 96 + MacroTile1: 128 + MacroTileA: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78018,14 +79475,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 3 + NumLoadsB: 4 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -78070,26 +79527,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 484 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 + SolutionIndex: 493 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppresssNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppresssNoLoadLoop: true + ThreadTile: [6, 8] + ThreadTile0: 6 + ThreadTile1: 8 + ThreadTileA: 6 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78102,7 +79559,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78128,21 +79585,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78155,11 +79612,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -78167,13 +79624,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -78219,18 +79676,18 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 485 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SolutionIndex: 494 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 SuppresssNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -78251,7 +79708,156 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 2 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 7680 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1536 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 96 + MacroTileA: 128 + MacroTileB: 96 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 48 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 4 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 495 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppresssNoLoadLoop: true + ThreadTile: [8, 6] + ThreadTile0: 8 + ThreadTile1: 6 + ThreadTileA: 8 + ThreadTileB: 6 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -78285,13 +79891,13 @@ LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -78304,7 +79910,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 128 MacroTileA: 128 @@ -78318,12 +79924,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -78368,8 +79974,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 486 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 + SolutionIndex: 496 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 @@ -78387,7 +79993,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78517,7 +80123,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 487 + SolutionIndex: 497 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78536,7 +80142,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78666,7 +80272,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 488 + SolutionIndex: 498 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78685,7 +80291,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78815,7 +80421,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 489 + SolutionIndex: 499 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78834,7 +80440,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -78964,7 +80570,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 490 + SolutionIndex: 500 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -78983,7 +80589,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79113,7 +80719,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 491 + SolutionIndex: 501 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -79132,7 +80738,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79262,7 +80868,7 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 492 + SolutionIndex: 502 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -79281,60 +80887,60 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 LVCA: 16 LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79347,11 +80953,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79359,15 +80965,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79411,48 +81017,48 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 493 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SolutionIndex: 503 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 2 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 2 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79464,26 +81070,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 + KernelLanguage: Source + LSCA: 32 + LSCB: 32 + LSPA: 2 + LSPB: 2 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -79496,11 +81102,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79508,15 +81114,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79560,26 +81166,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 494 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [4, 8] + SolutionIndex: 504 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79592,7 +81198,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79600,8 +81206,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79612,44 +81218,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 16 - LSPB: 4 - LVCA: 16 - LVCB: 64 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 128 - MacroTileA: 96 - MacroTileB: 128 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79657,15 +81263,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 3 - NumLoadsB: 4 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -79709,26 +81315,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 495 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 505 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: true - ThreadTile: [6, 8] - ThreadTile0: 6 - ThreadTile1: 8 - ThreadTileA: 6 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79749,56 +81355,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79806,13 +81412,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 @@ -79858,26 +81464,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 496 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SolutionIndex: 506 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppresssNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -79890,7 +81496,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -79898,8 +81504,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -79907,47 +81513,47 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 2 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 2 + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7680 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1536 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 96 - MacroTileA: 128 - MacroTileB: 96 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -79955,15 +81561,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 48 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 4 - NumLoadsB: 3 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 - NumLoadsPerpendicularA: 4 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80007,26 +81613,26 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 497 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [8, 6] - ThreadTile0: 8 - ThreadTile1: 6 - ThreadTileA: 8 - ThreadTileB: 6 + SolutionIndex: 507 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -80039,7 +81645,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80047,39 +81653,39 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 409 + LdsNumElementsAlignedA: 64 + LdsNumElementsAlignedB: 64 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 128 + LdsOffsetB: 64 + LdsOffsetB_Blk: 192 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -80092,11 +81698,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80104,15 +81710,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 64 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80156,46 +81762,46 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 498 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppresssNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SolutionIndex: 508 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppresssNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -80205,22 +81811,22 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 409 LdsNumElementsAlignedA: 64 @@ -80234,18 +81840,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 2 + MacroTile0: 8 + MacroTile1: 8 + MacroTileA: 8 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80253,8 +81859,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -80305,12 +81911,12 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 499 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SolutionIndex: 509 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 + SubGroup0: 4 + SubGroup1: 4 + SubGroupA: 4 + SubGroupB: 4 SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -80318,33 +81924,33 @@ ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] + WorkGroup: [4, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 2 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BufferLoad: false + BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 2 + DepthU: 4 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false + ExpandPointerSwap: true + FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -80353,23 +81959,23 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 - KernelLanguage: Source - LSCA: 32 - LSCB: 32 - LSPA: 2 - LSPB: 2 - LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 409 LdsNumElementsAlignedA: 64 @@ -80390,11 +81996,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -80402,8 +82008,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -80454,24 +82060,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 500 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 + SolutionIndex: 510 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 SuppresssNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 0 + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -80503,7 +82109,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -80603,13 +82209,13 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 501 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG04_04_04 + SolutionIndex: 511 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 SubGroupA: 4 SubGroupB: 4 - SuppresssNoLoadLoop: true + SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -80635,7 +82241,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80652,36 +82258,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 1 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 16 - LSPB: 16 + LSPA: 8 + LSPB: 8 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 512 + LdsNumElementsAlignedA: 128 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 256 + LdsOffsetB: 128 + LdsOffsetB_Blk: 384 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80700,7 +82306,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 @@ -80708,7 +82314,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -80752,13 +82358,13 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 502 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 + SolutionIndex: 512 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppresssNoLoadLoop: true + SuppresssNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -80770,7 +82376,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [8, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - AggressivePerfMode: 1 @@ -80801,7 +82407,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 2 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -80901,8 +82507,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 503 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 + SolutionIndex: 513 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 SubGroupA: 4 @@ -80933,7 +82539,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -80950,36 +82556,36 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 16 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 1024 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -80998,15 +82604,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81050,8 +82656,8 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 504 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_01 + SolutionIndex: 514 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 SubGroupA: 8 @@ -81068,13 +82674,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81088,8 +82693,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81098,48 +82702,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81147,15 +82751,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81199,31 +82801,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 505 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 515 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81231,14 +82833,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 4 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81247,31 +82848,31 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -81284,11 +82885,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81296,15 +82897,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -81348,31 +82947,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 506 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x04_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_01 - SubGroup0: 8 + SolutionIndex: 516 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81381,13 +82980,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81396,48 +82994,186 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 LVPA: 8 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + SolutionIndex: 517 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: true + DirectToLdsA: false + DirectToLdsB: true + DisableKernelPieces: 0 + EdgeType: ShiftPtr + FractionalLoad: 1 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 256 + LSPA: 8 + LSPB: 1 + LVCA: 32 + LVCB: 256 + LVPA: 8 + LVPB: 1 + LdcEqualsLdd: false + LdsNumElements: 2304 + LdsOffsetA: 0 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81445,20 +83181,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81497,31 +83231,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 507 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 518 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81530,13 +83264,12 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false + DirectToLds: true DirectToLdsA: false - DirectToLdsB: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81545,48 +83278,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 256 LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPB: 1 + LVCA: 32 + LVCB: 256 LVPA: 8 - LVPB: 8 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 512 - LdsNumElementsAlignedA: 128 - LdsNumElementsAlignedB: 128 + LdsNumElements: 2304 LdsOffsetA: 0 - LdsOffsetA_Blk: 256 - LdsOffsetB: 128 - LdsOffsetB_Blk: 384 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 256 + MacroTileA: 32 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81594,20 +83323,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularB: 8 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81646,31 +83373,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 508 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_02 + SolutionIndex: 519 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 SubGroup0: 8 - SubGroup1: 8 + SubGroup1: 32 SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SubGroupB: 32 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 32, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81678,14 +83405,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81694,48 +83420,44 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 8 - LSCB: 8 - LSPA: 8 - LSPB: 8 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 409 - LdsNumElementsAlignedA: 64 - LdsNumElementsAlignedB: 64 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 128 - LdsOffsetB: 64 - LdsOffsetB_Blk: 192 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 8 - MacroTile1: 8 - MacroTileA: 8 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81743,20 +83465,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -81795,31 +83515,31 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 509 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 - SubGroup0: 4 - SubGroup1: 4 - SubGroupA: 4 - SubGroupB: 4 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 520 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - - AggressivePerfMode: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true @@ -81833,8 +83553,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true @@ -81843,48 +83562,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 16 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 16 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -81892,14 +83611,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -81944,27 +83661,28 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 510 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppresssNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + SolutionIndex: 521 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: false AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -81982,39 +83700,39 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82029,9 +83747,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82040,12 +83758,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82089,24 +83807,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 511 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_16_01_WGM01 + SolutionIndex: 522 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 @@ -82129,38 +83847,38 @@ EdgeType: ShiftPtr FractionalLoad: 1 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82174,10 +83892,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82185,13 +83903,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -82235,24 +83953,24 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 512 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - ThreadTile: [4, 8] + SolutionIndex: 523 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + SubGroup0: 8 + SubGroup1: 32 + SubGroupA: 8 + SubGroupB: 32 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 32, 1] WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 @@ -82267,42 +83985,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82312,14 +84030,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82332,7 +84050,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -82377,19 +84095,19 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 513 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM01 + SolutionIndex: 524 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 SubGroup1: 32 SubGroupA: 8 SubGroupB: 32 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -82409,42 +84127,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82454,14 +84176,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82469,18 +84191,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 1 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82519,25 +84241,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 514 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM08 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [4, 8] + SolutionIndex: 525 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82551,42 +84273,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true + DepthU: 16 + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 256 - LSPA: 8 - LSPB: 1 - LVCA: 32 - LVCB: 256 - LVPA: 8 - LVPB: 1 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2304 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82596,14 +84318,14 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 256 - MacroTileA: 32 - MacroTileB: 256 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82616,7 +84338,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 @@ -82661,25 +84383,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 515 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x256x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG08_32_01_WGM64 - SubGroup0: 8 - SubGroup1: 32 - SubGroupA: 8 - SubGroupB: 32 + SolutionIndex: 526 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 ThreadTile: [4, 8] ThreadTile0: 4 ThreadTile1: 8 ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82700,35 +84422,39 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82753,18 +84479,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82803,25 +84529,25 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 516 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR0_PLR1_TT04_08_USFGRO01_VW04_WG16_08_01_WGM01 + SolutionIndex: 527 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - ThreadTile: [4, 8] + SubGroupB: 16 + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 @@ -82842,39 +84568,35 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -82889,9 +84611,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -82899,18 +84621,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -82949,33 +84671,34 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 517 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW01_GSU01_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_16_01_WGM01 + SolutionIndex: 528 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -82987,6 +84710,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83006,21 +84730,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 8 + LVPA: 1 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83035,9 +84755,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83049,14 +84769,19 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -83072,6 +84797,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83081,6 +84807,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83095,33 +84822,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 518 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 - SubGroup0: 16 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 529 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83133,8 +84870,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -83147,26 +84885,26 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83180,10 +84918,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 128 - MacroTileA: 32 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83191,13 +84929,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83218,6 +84961,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83227,6 +84971,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83241,44 +84986,55 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 519 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT032x128x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG08_32_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 530 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83298,17 +85054,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 16 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 8 + LVPA: 1 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83321,11 +85077,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83337,9 +85093,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83360,6 +85121,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83369,6 +85131,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83383,12 +85146,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 520 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 531 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 32 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 32 + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 ThreadTile1: 4 @@ -83400,27 +85171,30 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 32, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83441,20 +85215,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 + LSPA: 4 + LSPB: 4 LVCA: 16 LVCB: 16 - LVPA: 4 - LVPB: 4 + LVPA: 1 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83467,7 +85241,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -83479,13 +85253,18 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 64 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83506,6 +85285,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83515,6 +85295,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83529,33 +85310,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 521 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 532 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83567,6 +85358,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83585,18 +85377,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -83610,10 +85402,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -83625,9 +85417,14 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83648,6 +85445,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83657,6 +85455,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83671,33 +85470,43 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 522 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 533 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - - AggressivePerfMode: false + - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true + AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true BufferLoad: true BufferStore: true @@ -83709,6 +85518,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -83774,6 +85584,11 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -83794,6 +85609,7 @@ Index1: 1 IndexAssignmentsA: [0, 3, 2] IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] IndexUnroll: 3 IndexUnrollA: 1 IndexUnrollB: 1 @@ -83803,6 +85619,7 @@ NumIndicesBatch: 1 NumIndicesC: 3 NumIndicesFree: 2 + NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM SilentHighPrecisionAccumulate: false @@ -83817,12 +85634,20 @@ TransposeB: true UseBeta: true UseInitialStrides: false - SolutionIndex: 523 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM08 + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 534 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 SubGroup0: 16 SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -83835,150 +85660,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: false - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3072 - LdsOffsetA: 0 - LdsOffsetB: 1024 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - SolutionIndex: 524 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM08 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -83992,7 +85676,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84017,18 +85701,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 4 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 8 LVPA: 1 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84041,10 +85725,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -84057,9 +85741,9 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84114,14 +85798,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 525 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM1 + SolutionIndex: 535 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -84135,10 +85819,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84152,13 +85836,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84179,20 +85863,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84205,7 +85885,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -84217,13 +85897,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84233,8 +85913,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84278,31 +85958,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 526 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM8 + SolutionIndex: 536 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84316,7 +85996,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84342,17 +86022,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 1 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84365,11 +86045,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -84377,13 +86057,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 64 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84438,20 +86118,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 527 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG8_8_1_WGM64 + SolutionIndex: 537 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -84459,10 +86139,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84476,7 +86156,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -84503,20 +86183,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 + LSPA: 16 + LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 1 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84529,7 +86209,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -84541,13 +86221,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 64 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -84602,31 +86282,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 528 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT8_8_VW4_WG8_8_1_WGM64 + SolutionIndex: 538 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -84762,8 +86442,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 529 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM1 + SolutionIndex: 539 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84784,7 +86464,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84806,7 +86486,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -84834,13 +86514,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -84881,8 +86557,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -84926,8 +86602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 530 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 540 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -84935,7 +86611,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -84948,7 +86624,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -84989,18 +86665,18 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 4 + LSCA: 64 + LSCB: 64 + LSPA: 16 LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85014,10 +86690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85025,13 +86701,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85086,20 +86762,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 531 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM8 + SolutionIndex: 541 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -85107,8 +86783,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85130,7 +86806,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -85158,9 +86834,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85201,8 +86881,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -85246,8 +86926,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 532 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 542 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85255,7 +86935,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -85268,7 +86948,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -85284,43 +86964,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85333,7 +87017,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -85346,11 +87030,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85361,13 +87047,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85406,8 +87093,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 533 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 543 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85415,24 +87102,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85444,47 +87129,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85497,7 +87178,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -85510,11 +87191,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85525,13 +87208,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85570,8 +87254,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 534 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 544 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85579,24 +87263,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85608,43 +87290,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 4 - LSPB: 16 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85657,10 +87339,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -85669,13 +87351,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -85692,6 +87376,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85730,33 +87415,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 535 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT8_4_VW4_WG16_8_1_WGM64 + SolutionIndex: 545 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85768,43 +87451,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85817,11 +87504,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85829,12 +87516,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -85845,13 +87534,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -85890,8 +87580,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 536 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 546 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -85899,24 +87589,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -85928,43 +87616,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -85977,11 +87665,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -85990,12 +87678,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86006,12 +87696,13 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86050,15 +87741,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 537 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_PLR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 547 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -86066,17 +87757,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86088,47 +87777,43 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86141,11 +87826,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86154,12 +87839,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86169,13 +87856,14 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -86214,33 +87902,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 538 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 + SolutionIndex: 548 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true + SubGroupB: 8 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 64 + VectorWidth: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -86258,7 +87944,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -86278,21 +87964,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86307,9 +87989,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86317,15 +87999,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86335,7 +88017,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -86381,28 +88063,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 539 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 549 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -86443,15 +88125,15 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 8 LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LVCB: 16 + LVPA: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 640 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -86468,9 +88150,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86478,15 +88160,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -86542,28 +88224,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 540 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 550 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -86584,7 +88266,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -86604,17 +88286,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -86629,9 +88315,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -86639,14 +88325,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -86657,8 +88343,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -86703,8 +88389,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 541 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 551 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86712,12 +88398,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] + SuppressNoLoadLoop: true + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -86725,7 +88411,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -86868,8 +88554,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 542 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM1 + SolutionIndex: 552 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -86890,7 +88576,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87029,169 +88715,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 543 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdcEqualsLdd: false - LdsNumElements: 768 - LdsOffsetA: 0 - LdsOffsetB: 512 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 544 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 553 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87212,7 +88737,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87252,15 +88777,15 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 768 LdsOffsetA: 0 LdsOffsetB: 512 LdsPadA: 0 @@ -87277,9 +88802,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87287,14 +88812,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -87306,7 +88831,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -87351,8 +88876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 545 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 554 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87361,11 +88886,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -87373,7 +88898,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87512,8 +89037,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 546 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 555 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87534,7 +89059,7 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -87548,13 +89073,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -87583,12 +89108,8 @@ LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87601,7 +89122,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -87615,12 +89136,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -87631,7 +89152,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -87677,8 +89198,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 547 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 556 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -87686,7 +89207,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -87699,174 +89220,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 548 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_SNLL1_TT4_8_USFGRO1_VW1_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -87878,7 +89234,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -87904,17 +89260,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 2 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -87927,11 +89283,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -87939,13 +89295,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88003,8 +89359,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 549 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 557 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88013,11 +89369,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -88025,9 +89381,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88039,7 +89395,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88065,17 +89421,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 2 - LSPB: 4 + LSPB: 8 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88088,11 +89444,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88100,13 +89456,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88164,8 +89520,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 550 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WGM8 + SolutionIndex: 558 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88174,11 +89530,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -88186,9 +89542,9 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88200,7 +89556,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88226,17 +89582,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 2 + LSCB: 32 + LSPA: 4 LSPB: 8 LVCA: 64 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 640 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88249,11 +89605,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88263,13 +89619,13 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88325,15 +89681,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 551 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 559 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -88346,10 +89702,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88361,7 +89717,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88381,23 +89737,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 32 + LSPA: 2 LSPB: 4 LVCA: 64 - LVCB: 64 - LVPA: 4 + LVCB: 32 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88410,11 +89766,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88423,14 +89779,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88486,15 +89842,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 552 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WGM1 + SolutionIndex: 560 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -88506,11 +89862,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88522,7 +89878,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88542,23 +89898,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88571,11 +89927,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88583,13 +89939,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88602,7 +89958,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -88647,8 +90003,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 553 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 561 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88657,21 +90013,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88683,7 +90039,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88703,23 +90059,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 2 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88732,11 +90088,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -88744,13 +90100,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 128 PackBatchDims: 0 @@ -88763,7 +90119,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -88808,8 +90164,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 554 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM1 + SolutionIndex: 562 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -88818,21 +90174,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -88844,7 +90200,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -88864,23 +90220,23 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 8 + LSPA: 2 + LSPB: 4 LVCA: 64 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -88893,7 +90249,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -88905,15 +90261,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -88969,31 +90325,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 555 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WGM8 + SolutionIndex: 563 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89011,37 +90367,41 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89056,9 +90416,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89066,14 +90426,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 PackBatchDims: 0 PackFreeDims: 1 @@ -89084,8 +90444,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -89130,28 +90490,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 556 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 564 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -89166,43 +90526,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89215,11 +90579,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89229,13 +90593,13 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89245,7 +90609,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89291,31 +90655,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 557 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM1 + SolutionIndex: 565 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89327,43 +90691,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89376,11 +90744,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89390,13 +90758,11 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89406,8 +90772,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -89452,31 +90818,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 558 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 566 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroupB: 16 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89488,43 +90856,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89537,11 +90909,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -89549,15 +90921,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89567,7 +90939,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -89613,31 +90985,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 559 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_EPS0_FL0_GRVW1_NLCA1_NLCB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WGM8 + SolutionIndex: 567 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89649,7 +91021,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -89674,22 +91046,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 LSPA: 8 - LSPB: 8 - LVCA: 16 + LSPB: 16 + LVCA: 32 LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89702,10 +91074,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89716,13 +91088,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -89778,14 +91150,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 560 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 568 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [8, 4] @@ -89799,10 +91171,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89821,7 +91193,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -89839,22 +91211,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -89868,9 +91240,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -89879,13 +91251,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -89943,8 +91313,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 561 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 569 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -89953,10 +91323,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -89968,6 +91338,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -89986,7 +91358,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90004,22 +91376,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90033,10 +91405,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90044,12 +91416,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90106,8 +91480,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 562 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 570 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90116,11 +91490,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90131,8 +91505,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90151,7 +91523,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90169,22 +91541,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90198,9 +91570,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90209,13 +91581,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90273,8 +91643,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 563 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM1 + SolutionIndex: 571 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90283,10 +91653,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -90298,6 +91668,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90334,22 +91706,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90363,9 +91735,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90374,13 +91746,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -90438,8 +91810,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 564 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 572 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90448,10 +91820,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -90460,7 +91832,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -90481,7 +91853,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90499,22 +91871,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90528,10 +91900,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90541,10 +91913,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90601,8 +91975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 565 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 573 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90611,11 +91985,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90623,11 +91997,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90646,7 +92018,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90664,22 +92036,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 128 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 2 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -90693,9 +92065,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -90704,13 +92076,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -90768,8 +92138,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 566 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 574 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90778,10 +92148,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] - ThreadTile0: 8 + ThreadTile: [4, 8] + ThreadTile0: 4 ThreadTile1: 8 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -90790,9 +92160,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90811,7 +92183,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90830,17 +92202,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -90859,9 +92231,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -90869,12 +92241,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -90931,8 +92305,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 567 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM1 + SolutionIndex: 575 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -90941,11 +92315,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 8] + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -90953,11 +92327,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -90976,7 +92348,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -90994,22 +92366,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91023,9 +92395,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91034,13 +92406,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -91098,8 +92468,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 568 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 576 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91108,10 +92478,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -91123,6 +92493,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -91159,22 +92531,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91188,9 +92560,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91199,13 +92571,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91263,8 +92635,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 569 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 577 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91273,10 +92645,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -91324,22 +92696,22 @@ GuaranteeNoPartialB: false InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 16 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91353,9 +92725,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91364,11 +92736,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91426,8 +92798,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 570 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WGM8 + SolutionIndex: 578 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91436,10 +92808,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false @@ -91456,7 +92828,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91471,7 +92843,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -91488,23 +92860,24 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91518,9 +92891,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -91529,13 +92902,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -91547,8 +92918,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91593,8 +92965,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 571 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 579 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91602,11 +92974,11 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -91618,10 +92990,12 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91629,47 +93003,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 + LSCA: 64 + LSCB: 128 LSPA: 8 - LSPB: 16 + LSPB: 4 LVCA: 32 - LVCB: 16 - LVPA: 2 - LVPB: 4 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91682,11 +93057,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91695,11 +93070,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -91710,6 +93087,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -91756,8 +93134,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 572 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 580 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91765,28 +93143,26 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91794,7 +93170,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91802,39 +93178,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -91847,11 +93224,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -91859,13 +93236,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -91877,8 +93254,9 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -91923,8 +93301,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 573 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 581 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -91932,14 +93310,14 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -91947,11 +93325,11 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -91959,7 +93337,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -91967,39 +93345,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 32 + LVCA: 64 LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92012,11 +93391,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92024,12 +93403,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -92040,6 +93419,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -92086,8 +93466,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 574 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WGM8 + SolutionIndex: 582 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92095,22 +93475,22 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 8] + SuppressNoLoadLoop: false + ThreadTile: [8, 4] ThreadTile0: 8 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 8 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -92124,48 +93504,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92178,7 +93554,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92191,12 +93567,15 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92207,8 +93586,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -92253,8 +93632,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 575 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_NLCB1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 583 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92269,17 +93648,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92297,42 +93674,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 32 + LVCA: 64 LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92347,9 +93720,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -92357,15 +93730,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92376,7 +93750,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92422,8 +93796,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 576 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM8 + SolutionIndex: 584 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92432,19 +93806,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -92484,22 +93858,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 + LSPA: 4 LSPB: 4 - LVCA: 128 + LVCA: 64 LVCB: 64 - LVPA: 2 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92513,9 +93887,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92524,15 +93898,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92544,7 +93919,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -92589,8 +93964,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 577 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WGM8 + SolutionIndex: 585 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92599,10 +93974,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -92611,7 +93986,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -92625,44 +94000,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 LSPA: 4 - LSPB: 8 + LSPB: 4 LVCA: 64 - LVCB: 32 - LVPA: 2 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -92679,10 +94054,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -92691,13 +94066,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -92754,8 +94132,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 578 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 586 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92764,23 +94142,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92792,44 +94168,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -92842,7 +94222,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -92855,13 +94235,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -92874,7 +94254,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -92920,8 +94300,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 579 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 587 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -92936,15 +94316,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -92956,13 +94336,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -92976,30 +94356,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93007,10 +94391,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93018,14 +94402,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93038,8 +94422,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93084,31 +94468,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 580 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 588 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93120,13 +94504,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -93140,34 +94524,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93175,10 +94555,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93186,14 +94566,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93206,7 +94586,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93252,31 +94632,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 581 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 589 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93295,58 +94675,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 4 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1664 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 128 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93354,14 +94734,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -93375,7 +94753,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -93420,31 +94798,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 582 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93456,65 +94836,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 6400 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 8 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93522,13 +94902,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93588,31 +94966,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 583 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93624,54 +95004,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93679,10 +95059,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93690,13 +95070,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93711,7 +95089,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -93756,31 +95134,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 584 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93792,50 +95172,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -93843,10 +95227,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -93854,13 +95238,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -93874,7 +95256,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -93920,31 +95302,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 585 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 593 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [6, 4] + ThreadTile0: 6 + ThreadTile1: 4 + ThreadTileA: 6 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -93956,7 +95340,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -93976,45 +95360,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 4 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 1664 + LdsNumElements: 3328 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 128 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94022,12 +95406,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94041,7 +95425,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -94086,33 +95470,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 586 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x16_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 594 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94124,54 +95508,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 4 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 LVPA: 4 - LVPB: 16 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6400 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -94180,9 +95564,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 8 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 8 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94190,12 +95574,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94209,7 +95595,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -94254,33 +95640,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 587 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x8x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 595 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94299,7 +95683,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -94318,22 +95702,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94347,10 +95731,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94358,12 +95742,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94422,8 +95808,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 588 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 596 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94432,11 +95818,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -94447,8 +95833,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94486,22 +95870,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 128 LSCB: 64 - LSPA: 5 + LSPA: 4 LSPB: 8 - LVCA: 48 + LVCA: 64 LVCB: 32 - LVPA: 3 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94515,9 +95899,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -94526,8 +95910,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -94590,8 +95974,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 589 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 597 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94600,10 +95984,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -94612,10 +95996,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -94635,7 +96019,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -94644,7 +96028,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -94655,21 +96039,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 64 LSPA: 8 - LSPB: 5 + LSPB: 8 LVCA: 32 - LVCB: 48 + LVCB: 32 LVPA: 4 - LVPB: 3 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94684,9 +96068,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94694,12 +96078,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94758,8 +96144,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 590 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 598 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94768,11 +96154,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -94783,8 +96169,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94796,7 +96180,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -94804,40 +96188,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -94850,11 +96234,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -94862,14 +96246,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -94883,7 +96267,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -94928,8 +96312,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 591 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 599 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -94938,21 +96322,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -94973,7 +96357,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -94986,43 +96370,43 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 32 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 32 - LVCB: 64 + LVCB: 32 LVPA: 4 - LVPB: 2 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95030,14 +96414,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95096,29 +96480,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 592 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 600 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -95132,54 +96516,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 8 - LVCA: 64 + LVCA: 8 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95187,10 +96571,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95198,12 +96582,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95262,37 +96648,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 593 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 601 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95300,7 +96684,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95308,46 +96692,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -95355,10 +96739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95366,14 +96750,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95432,31 +96816,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 594 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 602 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95468,7 +96852,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95476,57 +96860,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 8 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95534,14 +96918,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95600,35 +96984,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 595 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 603 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95636,7 +97020,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95644,7 +97028,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -95652,49 +97036,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 32 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 32 LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LVCB: 8 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95702,13 +97086,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -95768,31 +97152,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 596 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 604 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -95804,7 +97188,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95812,7 +97196,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -95820,32 +97204,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -95858,11 +97242,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -95870,14 +97254,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -95891,7 +97275,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -95936,35 +97320,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 597 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 605 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -95972,7 +97356,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -95992,34 +97376,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96027,10 +97411,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96038,14 +97422,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96104,31 +97488,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 598 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM8 + SolutionIndex: 606 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96140,7 +97524,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96148,46 +97532,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 + LSCA: 128 + LSCB: 64 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 16 - LVPB: 8 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96195,10 +97579,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96206,14 +97590,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96272,35 +97656,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 599 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 607 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -96308,44 +97692,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96355,7 +97739,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -96363,10 +97747,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -96374,13 +97758,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -96395,7 +97777,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -96440,31 +97822,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 600 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x8x32_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT2_2_USFGRO1_VW1_WG16_4_4_WGM8 + SolutionIndex: 608 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96485,7 +97869,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -96498,22 +97882,22 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 64 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96523,27 +97907,29 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -96592,6 +97978,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96608,15 +97995,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 601 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 609 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -96629,8 +98016,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -96644,7 +98031,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96652,36 +98039,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96698,26 +98085,28 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -96760,6 +98149,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96776,8 +98166,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 602 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 610 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96786,21 +98176,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96812,44 +98202,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -96866,25 +98256,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -96928,6 +98318,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -96944,8 +98335,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 603 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 611 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -96954,21 +98345,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96980,7 +98373,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -96988,36 +98381,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -97034,23 +98427,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -97065,7 +98460,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -97094,6 +98489,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -97110,8 +98506,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 604 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_AMAS3_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 612 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97120,21 +98516,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -97148,48 +98544,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97202,11 +98598,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97216,14 +98612,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97283,8 +98677,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 605 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 613 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97293,21 +98687,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97319,48 +98715,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97373,11 +98769,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97387,14 +98783,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97454,8 +98848,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 606 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 614 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97464,21 +98858,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97490,7 +98886,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -97498,40 +98894,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 LVPA: 4 - LVPB: 4 + LVPB: 3 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97544,11 +98940,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97558,12 +98954,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97623,8 +99019,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 607 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 615 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97633,23 +99029,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97661,48 +99057,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -97715,11 +99111,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97729,12 +99125,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -97794,8 +99192,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 608 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 616 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97804,23 +99202,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -97859,17 +99255,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -97888,9 +99284,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -97900,8 +99296,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -97919,7 +99315,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -97965,8 +99361,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 609 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 617 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -97975,11 +99371,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -97991,7 +99387,7 @@ WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98030,17 +99426,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -98059,9 +99455,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98071,8 +99467,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -98136,8 +99532,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 610 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 618 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98146,11 +99542,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98158,11 +99554,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98181,7 +99577,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -98200,22 +99596,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 96 - LSPA: 8 - LSPB: 5 - LVCA: 32 - LVCB: 48 - LVPA: 4 - LVPB: 3 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98229,10 +99625,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98242,12 +99638,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98307,8 +99705,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 611 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 619 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98317,11 +99715,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98329,11 +99727,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98371,22 +99767,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98400,10 +99796,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98415,12 +99811,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98480,8 +99876,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 612 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 620 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98490,11 +99886,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98502,7 +99898,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -98542,22 +99938,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98571,10 +99967,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98588,8 +99984,8 @@ NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98649,8 +100045,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 613 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 621 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98659,11 +100055,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -98694,41 +100090,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -98742,9 +100138,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 128 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -98755,12 +100151,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98820,8 +100218,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 614 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 622 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -98830,23 +100228,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -98866,36 +100262,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -98914,9 +100310,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -98926,14 +100322,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -98947,7 +100343,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -98993,8 +100389,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 615 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 623 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -99003,19 +100399,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -99029,7 +100425,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -99037,56 +100433,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99097,14 +100493,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99164,31 +100560,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 616 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 624 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99200,64 +100596,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -99268,12 +100664,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99333,33 +100731,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 617 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 625 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99371,65 +100767,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99439,14 +100835,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99506,31 +100900,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 618 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 626 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99542,65 +100938,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99610,14 +101006,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99631,7 +101025,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -99677,31 +101071,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 619 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_PGR1_PLR0_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 627 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -99739,22 +101135,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -99768,10 +101164,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99783,12 +101179,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -99848,15 +101244,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 620 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 628 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -99869,8 +101265,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -99891,7 +101287,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -99910,22 +101306,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -99939,10 +101335,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -99954,12 +101350,10 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100019,15 +101413,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 621 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 629 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -100040,14 +101434,16 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100055,48 +101451,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -100109,25 +101505,25 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -100171,7 +101567,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100188,15 +101583,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 622 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 630 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -100204,17 +101599,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100233,73 +101626,73 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100342,7 +101735,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100359,33 +101751,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 623 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG8_16_2_WGM8 + SolutionIndex: 631 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -100397,7 +101787,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -100405,36 +101795,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -100444,7 +101834,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -100452,27 +101842,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -100515,7 +101903,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100532,35 +101919,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 624 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 632 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100568,61 +101955,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -100630,14 +102017,14 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 - MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -100684,7 +102071,6 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM - SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -100701,37 +102087,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 625 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 633 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -100759,44 +102143,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 4 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -100805,13 +102189,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -100871,29 +102255,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 626 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM8 + SolutionIndex: 634 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -100914,7 +102298,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -100934,17 +102318,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -100963,9 +102347,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -100973,14 +102357,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101039,8 +102421,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 627 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 635 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -101049,11 +102431,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -101061,9 +102443,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101082,58 +102466,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101141,14 +102525,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101207,35 +102589,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 628 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 636 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101243,7 +102627,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -101263,7 +102647,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -101278,19 +102662,19 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -101310,13 +102694,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101375,31 +102759,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 629 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 637 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101419,57 +102803,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101477,14 +102861,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101543,35 +102927,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 630 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x16_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 638 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -101579,40 +102963,40 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -101626,18 +103010,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101645,12 +103029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -101709,15 +103095,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 631 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 639 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -101725,17 +103111,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101754,41 +103138,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 + LSCA: 64 + LSCB: 32 + LSPA: 4 LSPB: 8 - LVCA: 16 + LVCA: 64 LVCB: 32 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101802,10 +103186,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101814,10 +103198,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -101877,15 +103263,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 632 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 640 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -101893,17 +103279,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -101922,41 +103306,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 + LSCA: 32 + LSCB: 64 + LSPA: 16 LSPB: 8 - LVCA: 64 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -101970,10 +103354,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -101983,11 +103367,9 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -102047,15 +103429,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 633 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 641 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -102063,15 +103445,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102083,7 +103467,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102091,36 +103475,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102130,18 +103514,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102149,13 +103533,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -102215,35 +103599,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 634 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 642 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -102258,31 +103642,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -102318,13 +103702,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102383,8 +103765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 635 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 643 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -102399,15 +103781,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102419,7 +103803,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -102439,24 +103823,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -102466,7 +103850,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102474,9 +103858,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102485,14 +103869,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102551,14 +103935,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 636 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 + SolutionIndex: 644 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -102571,11 +103955,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102594,57 +103978,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -102653,12 +104037,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -102717,14 +104103,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 637 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 645 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -102733,17 +104119,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102755,14 +104139,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -102771,7 +104155,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -102781,28 +104165,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -102810,10 +104194,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102821,13 +104205,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -102887,31 +104269,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 638 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 646 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -102923,14 +104307,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -102949,14 +104333,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -102970,18 +104354,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -102989,8 +104373,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -103053,15 +104439,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 639 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 647 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103074,12 +104460,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103091,7 +104475,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103099,46 +104483,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 64 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -103146,10 +104530,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103157,14 +104541,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103223,15 +104607,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 640 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM1 + SolutionIndex: 648 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103239,15 +104623,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103266,31 +104650,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -103327,12 +104711,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103391,8 +104773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 641 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 649 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103407,7 +104789,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -103416,6 +104798,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103427,48 +104811,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103481,7 +104865,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -103495,10 +104879,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -103557,8 +104943,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 642 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 650 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -103573,7 +104959,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -103581,9 +104967,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103621,38 +105005,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -103661,13 +105045,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -103727,14 +105111,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 643 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 651 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -103748,7 +105132,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -103763,7 +105147,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -103790,27 +105174,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -103819,9 +105203,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103829,13 +105213,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -103895,15 +105279,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 644 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 652 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -103916,10 +105300,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -103938,41 +105322,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -103986,10 +105370,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -103997,12 +105381,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104061,8 +105447,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 645 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 653 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104071,23 +105457,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104099,7 +105483,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104115,7 +105499,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -104125,39 +105509,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 512 + LdsOffsetB: 256 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104165,14 +105549,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104231,31 +105615,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 646 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 654 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104267,7 +105651,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104275,40 +105659,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104321,7 +105705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -104335,12 +105719,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104399,8 +105783,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 647 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 655 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -104415,7 +105799,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -104423,7 +105807,7 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104435,7 +105819,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -104451,7 +105835,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -104462,27 +105846,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -104491,9 +105875,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104501,13 +105885,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -104567,15 +105951,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 648 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 656 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -104588,10 +105972,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104603,54 +105987,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -104658,10 +106042,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104669,14 +106053,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104735,31 +106117,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 649 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW2_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 657 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -104787,49 +106171,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 - LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -104837,14 +106221,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -104903,28 +106287,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 650 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO1_VW2_WG8_16_2_WGM8 + SolutionIndex: 658 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -104939,48 +106323,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -104993,7 +106377,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -105007,12 +106391,10 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105071,8 +106453,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 651 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 659 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -105087,7 +106469,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -105095,7 +106477,9 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105107,7 +106491,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105115,46 +106499,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105162,10 +106546,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105173,14 +106557,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105239,35 +106623,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 652 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 660 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105282,58 +106666,58 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 32 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 8 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105341,11 +106725,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -105405,33 +106791,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 653 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 661 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105443,7 +106827,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105451,46 +106835,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -105498,10 +106882,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105509,13 +106893,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -105575,31 +106959,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 654 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 662 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105611,64 +106995,64 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -105677,11 +107061,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -105741,14 +107127,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 655 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 663 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -105761,13 +107147,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -105779,7 +107163,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105799,45 +107183,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -105845,14 +107229,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -105911,35 +107295,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 656 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW1_TT2_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 664 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -105947,7 +107331,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -105967,34 +107351,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -106002,10 +107386,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106013,13 +107397,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106079,31 +107463,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 657 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 665 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106122,7 +107506,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106141,22 +107525,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106170,9 +107554,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106181,13 +107565,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106247,8 +107629,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 658 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 666 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106257,10 +107639,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106269,9 +107651,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106283,14 +107667,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106309,22 +107693,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -106337,10 +107721,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106349,14 +107733,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -106415,8 +107797,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 659 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 667 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -106425,10 +107807,10 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -106437,9 +107819,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106477,39 +107861,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 32 + LSCB: 64 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106517,14 +107901,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -106583,28 +107967,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 660 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x16_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 668 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -106619,7 +108003,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -106627,46 +108011,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -106674,9 +108058,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -106685,13 +108069,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -106751,14 +108135,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 661 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 669 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -106767,15 +108151,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106787,14 +108171,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -106813,14 +108197,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -106834,18 +108218,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -106853,8 +108237,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -106917,15 +108303,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 662 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 670 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -106938,12 +108324,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -106955,54 +108339,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107010,10 +108394,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107021,12 +108405,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107085,33 +108471,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 663 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 671 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107123,7 +108507,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107149,22 +108533,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -107177,11 +108561,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107192,11 +108576,11 @@ NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107255,15 +108639,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 664 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM8 + SolutionIndex: 672 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -107276,10 +108660,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107291,7 +108675,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107299,46 +108683,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 16 + LSCB: 32 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -107347,9 +108731,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107357,14 +108741,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107423,15 +108807,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 665 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 673 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -107439,15 +108823,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107459,7 +108843,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -107467,32 +108851,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -107513,11 +108897,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -107525,14 +108909,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -107591,8 +108975,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 666 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 674 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107601,13 +108985,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -107615,7 +108999,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -107759,8 +109143,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 667 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 675 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -107781,13 +109165,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -107815,7 +109199,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -107830,7 +109214,7 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 1792 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 @@ -107842,14 +109226,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -107862,7 +109246,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -107927,28 +109311,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 668 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 676 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -107971,30 +109355,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -108030,13 +109414,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108095,8 +109479,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 669 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG8_8_4_WGM8 + SolutionIndex: 677 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108111,11 +109495,11 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -108123,7 +109507,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108131,7 +109515,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108151,34 +109535,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 16 + LSCB: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108186,10 +109570,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108197,13 +109581,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -108263,31 +109647,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 670 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 678 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -108319,28 +109703,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -108354,10 +109738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108365,14 +109749,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108431,8 +109815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 671 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 679 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -108441,25 +109825,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108467,7 +109851,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108487,45 +109871,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 4 - LSPB: 8 + LSPB: 16 LVCA: 64 - LVCB: 32 + LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108533,14 +109917,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108599,15 +109983,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 672 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 680 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -108619,11 +110003,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -108635,54 +110019,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108690,10 +110074,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -108701,10 +110085,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -108767,15 +110149,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 673 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 681 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -108787,15 +110169,17 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108803,54 +110187,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -108858,9 +110242,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -108869,14 +110253,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -108935,35 +110317,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 674 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM8 + SolutionIndex: 682 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -108971,7 +110355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -108991,34 +110375,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109026,10 +110410,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109037,14 +110421,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109103,15 +110487,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 675 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 683 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] ThreadTile0: 4 @@ -109123,11 +110507,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109139,7 +110523,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109165,39 +110549,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCA: 128 + LSCB: 128 + LSPA: 2 + LSPB: 2 + LVCA: 128 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109205,14 +110589,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109271,35 +110655,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 676 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 684 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109307,54 +110691,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 + LSCA: 16 + LSCB: 32 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109362,10 +110746,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109373,12 +110757,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109437,33 +110823,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 677 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 685 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109475,54 +110859,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109543,9 +110927,11 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -109605,37 +110991,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 678 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 686 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109643,7 +111027,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109663,34 +111047,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -109698,10 +111082,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109709,13 +111093,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -109775,31 +111159,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 679 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_AMAS1_GRVW1_TT4_2_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 687 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -109818,41 +111202,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 - LSPA: 2 - LSPB: 2 - LVCA: 128 - LVCB: 128 - LVPA: 2 - LVPB: 2 + LSCA: 64 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -109866,10 +111250,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -109877,14 +111261,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -109943,8 +111325,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 680 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 688 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -109953,25 +111335,27 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -109979,7 +111363,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -109987,46 +111371,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 64 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 32 LVCB: 32 - LVPA: 16 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110034,10 +111418,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110045,14 +111429,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110111,31 +111495,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 681 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS1_GRVW1_TT2_4_USFGRO1_VW1_WG8_8_4_WGM1 + SolutionIndex: 689 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110147,7 +111531,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110156,56 +111540,56 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 + LSCA: 16 + LSCB: 64 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 4 - LVPB: 8 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 6656 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110213,14 +111597,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110279,31 +111663,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 682 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_AMAS3_GRVW4_TT4_2_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 690 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110315,7 +111699,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110323,46 +111707,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110370,10 +111754,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110381,13 +111765,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -110447,31 +111831,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 683 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 691 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110490,9 +111874,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -110503,28 +111887,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 + LSCA: 128 + LSCB: 128 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 - LVPA: 4 - LVPB: 4 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110538,10 +111922,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110549,8 +111933,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -110613,8 +111999,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 684 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 692 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -110623,23 +112009,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110651,7 +112035,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110667,7 +112051,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -110677,28 +112061,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 8 + LSPA: 16 LSPB: 8 - LVCA: 32 + LVCA: 16 LVCB: 32 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -110706,9 +112090,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -110717,14 +112101,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110783,14 +112167,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 685 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 693 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -110804,10 +112188,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110819,7 +112203,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -110827,40 +112211,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 64 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 16 - LVPB: 4 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -110873,11 +112257,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -110887,12 +112271,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -110951,31 +112335,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 686 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 694 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -110994,57 +112378,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111053,14 +112437,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111119,31 +112501,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 687 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_AMAS3_GRVW2_TT2_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 695 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 4 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111155,7 +112539,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -111163,57 +112547,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 128 + LSCA: 64 + LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 2 - LVPB: 2 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111221,14 +112605,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111287,31 +112671,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 688 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_AMAS3_GRVW4_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 696 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111323,54 +112707,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -111379,9 +112763,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -111389,14 +112773,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -111455,15 +112837,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 689 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 697 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -111475,11 +112857,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111498,57 +112882,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 16 LSPB: 16 - LVCA: 64 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 16 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111557,13 +112941,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -111623,14 +113005,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 690 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW1_TT4_2_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 698 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 2] @@ -111639,15 +113021,17 @@ ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111666,7 +113050,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -111727,6 +113111,8 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -111789,7 +113175,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 691 + SolutionIndex: 699 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -111814,8 +113200,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -111853,38 +113237,38 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -111893,13 +113277,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -111959,14 +113343,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 692 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 700 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -111980,7 +113364,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -112002,31 +113386,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -112063,10 +113447,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112125,8 +113511,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 693 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM1 + SolutionIndex: 701 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112141,7 +113527,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -112150,12 +113536,10 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112163,44 +113547,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 @@ -112210,18 +113594,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112229,12 +113613,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112293,33 +113679,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 694 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 702 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] + VectorWidth: 1 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112331,7 +113715,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112339,46 +113723,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112387,9 +113771,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112397,14 +113781,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112463,15 +113847,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 695 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 703 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -112483,11 +113867,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112499,7 +113883,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112508,7 +113892,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -112521,22 +113905,22 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3328 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -112546,7 +113930,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112554,10 +113938,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -112568,11 +113952,11 @@ NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112631,31 +114015,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 696 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 704 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -112667,7 +114051,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -112675,36 +114059,36 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 8 - LSPB: 8 + LSPB: 16 LVCA: 32 - LVCB: 32 - LVPA: 8 + LVCB: 16 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -112714,7 +114098,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -112722,9 +114106,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -112733,14 +114117,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -112799,14 +114183,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 697 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW4_WG8_8_4_WGM1 + SolutionIndex: 705 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -112815,19 +114199,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -112855,7 +114239,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -112902,7 +114286,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -112967,8 +114351,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 698 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS1_GRVW1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 706 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -112987,9 +114371,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -113003,7 +114387,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113029,14 +114413,14 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 @@ -113050,18 +114434,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113069,8 +114453,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -113135,15 +114519,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 699 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 707 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -113156,178 +114540,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 2 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 2 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 700 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_AMAS3_GRVW2_TT4_2_USFGRO0_VW2_WG16_8_2_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113346,7 +114562,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -113407,8 +114623,6 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -113471,7 +114685,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 701 + SolutionIndex: 708 SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -113496,6 +114710,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113507,7 +114723,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113515,40 +114731,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113561,7 +114777,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -113574,13 +114790,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113639,8 +114855,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 702 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_GRVW1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 709 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113655,15 +114871,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113682,20 +114898,20 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -113703,10 +114919,10 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -113742,13 +114958,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113807,8 +115021,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 703 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 710 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -113827,11 +115041,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -113843,7 +115059,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -113869,22 +115085,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 32 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -113897,11 +115113,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -113914,7 +115130,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -113973,15 +115189,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 704 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG16_8_2_WGM8 + SolutionIndex: 711 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -113994,10 +115210,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -114011,7 +115227,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114037,28 +115253,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7168 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114066,9 +115282,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114077,14 +115293,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114143,14 +115359,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 705 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 712 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -114164,10 +115380,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114186,41 +115402,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 8 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114235,9 +115451,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114245,12 +115461,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114309,8 +115527,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 706 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_8_4_WGM8 + SolutionIndex: 713 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114319,23 +115537,21 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114354,7 +115570,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -114373,39 +115589,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114415,10 +115631,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114477,20 +115695,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 707 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_4_USFGRO0_VW2_WG8_16_2_WGM1 + SolutionIndex: 714 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -114498,12 +115716,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -114523,40 +115739,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114570,9 +115786,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -114581,14 +115797,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114647,8 +115863,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 708 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW4_TT4_4_USFGRO0_VW4_WG8_16_2_WGM1 + SolutionIndex: 715 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114657,19 +115873,19 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -114709,22 +115925,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -114738,10 +115954,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -114751,12 +115967,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -114815,8 +116031,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 709 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x32_SE_AMAS3_GRVW2_TT4_8_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 716 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -114825,11 +116041,11 @@ SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -114837,7 +116053,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -114851,7 +116067,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -114859,46 +116075,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -114907,23 +116123,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -114938,7 +116156,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -114967,12 +116185,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -114983,31 +116203,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 710 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM1 + SolutionIndex: 717 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115019,61 +116239,57 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115081,18 +116297,20 @@ MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -115105,7 +116323,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -115135,12 +116353,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -115151,31 +116371,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 711 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_16_2_WGM8 + SolutionIndex: 718 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115187,7 +116407,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115195,46 +116415,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -115243,23 +116463,25 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -115303,12 +116525,14 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -115319,31 +116543,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 712 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x32_SE_AMAS3_GRVW2_TT8_4_USFGRO0_VW2_WG8_8_4_WGM8 + SolutionIndex: 719 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115444,7 +116668,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -115491,8 +116715,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 713 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 720 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115527,7 +116751,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -115562,9 +116786,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115577,7 +116801,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -115593,12 +116817,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -115659,8 +116883,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 714 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 721 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115683,7 +116907,175 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -115715,7 +117107,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -115764,7 +117156,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -115784,7 +117176,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -115831,8 +117223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 715 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -115851,7 +117243,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -115873,7 +117265,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -115887,7 +117279,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -115902,13 +117294,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -115936,7 +117324,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -115955,8 +117343,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116003,8 +117391,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 716 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116023,7 +117411,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -116039,13 +117427,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -116059,7 +117447,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -116075,8 +117463,12 @@ LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116089,7 +117481,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116104,13 +117496,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116123,8 +117515,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -116171,8 +117563,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 717 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 725 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116191,11 +117583,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116207,7 +117599,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116227,7 +117619,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -116242,9 +117634,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116257,7 +117649,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116272,13 +117664,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116339,8 +117731,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 726 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116359,11 +117751,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116464,7 +117856,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116511,8 +117903,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 727 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116533,7 +117925,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -116547,7 +117939,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116582,9 +117974,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116597,7 +117989,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116613,12 +118005,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116632,7 +118024,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116679,8 +118071,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 728 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116701,9 +118093,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116715,7 +118107,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116750,13 +118142,13 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116769,7 +118161,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116785,12 +118177,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -116804,7 +118196,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -116851,8 +118243,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 729 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -116873,9 +118265,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116887,7 +118279,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -116922,9 +118314,9 @@ LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -116937,7 +118329,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -116953,12 +118345,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117019,8 +118411,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 730 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117041,9 +118433,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117063,30 +118455,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false @@ -117124,13 +118516,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117191,8 +118583,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 731 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117207,13 +118599,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -117227,44 +118619,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117277,7 +118673,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117292,13 +118688,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117311,7 +118707,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117359,8 +118755,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 732 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117375,15 +118771,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117395,7 +118791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117403,40 +118799,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117449,7 +118845,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117464,13 +118860,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117531,8 +118927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 733 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117547,15 +118943,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117567,44 +118963,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117617,7 +119017,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117632,13 +119032,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -117651,7 +119051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -117699,8 +119099,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 734 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117715,15 +119115,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117735,7 +119135,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117743,19 +119143,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -117763,20 +119163,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117789,7 +119189,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117804,7 +119204,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -117871,8 +119271,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 735 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -117891,11 +119291,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -117907,7 +119307,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -117915,19 +119315,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -117935,20 +119335,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -117961,7 +119361,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -117976,7 +119376,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -118043,8 +119443,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 736 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118063,11 +119463,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118086,7 +119486,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -118095,7 +119495,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -118149,8 +119549,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -118215,8 +119613,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 737 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118237,9 +119635,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118251,16 +119651,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -118271,7 +119671,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -118279,20 +119679,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118305,7 +119705,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -118320,9 +119720,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -118387,8 +119785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 738 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118407,11 +119805,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118423,7 +119823,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118431,40 +119831,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118477,10 +119877,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118491,14 +119891,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -118559,8 +119959,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 739 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118569,21 +119969,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118595,7 +119995,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118603,40 +120003,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118649,10 +120049,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118663,14 +120063,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -118731,8 +120131,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 740 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118741,21 +120141,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -118793,22 +120193,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 8 + LSPA: 5 LSPB: 8 - LVCA: 32 + LVCA: 48 LVCB: 32 - LVPA: 4 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118822,9 +120222,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -118835,11 +120235,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -118901,8 +120301,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 741 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -118911,10 +120311,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -118923,10 +120323,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -118939,7 +120339,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -118947,40 +120347,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 96 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 + LSPA: 5 + LSPB: 8 + LVCA: 48 + LVCB: 32 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -118993,10 +120393,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -119007,11 +120407,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -119073,8 +120473,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 742 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119083,22 +120483,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -119137,22 +120537,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119166,10 +120566,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119181,12 +120581,12 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsA: 2 + NumLoadsB: 3 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119247,8 +120647,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 743 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119257,11 +120657,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -119269,7 +120669,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -119290,41 +120690,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 + LSCA: 64 + LSCB: 96 LSPA: 8 - LSPB: 4 + LSPB: 5 LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119338,10 +120738,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119353,9 +120753,7 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -119419,8 +120817,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 744 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119429,21 +120827,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119481,22 +120881,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 96 + LSPA: 8 + LSPB: 5 + LVCA: 32 + LVCB: 48 + LVPA: 4 + LVPB: 3 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119510,10 +120910,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119527,8 +120927,8 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119589,8 +120989,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 745 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119599,11 +120999,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -119611,11 +121011,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119634,41 +121034,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 - LSPB: 8 - LVCA: 48 - LVCB: 32 - LVPA: 3 - LVPB: 4 + LSCA: 64 + LSCB: 128 + LSPA: 4 + LSPB: 2 + LVCA: 64 + LVCB: 128 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -119682,10 +121082,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119695,12 +121095,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119714,7 +121116,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -119761,8 +121163,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 746 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119771,23 +121173,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -119819,24 +121219,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 128 LSPA: 4 - LSPB: 8 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 128 LVPA: 4 - LVPB: 8 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -119855,9 +121255,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -119867,14 +121267,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 3 + NumLoadsB: 4 NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 3 + NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -119888,7 +121288,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -119935,8 +121335,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_6_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 747 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -119945,17 +121345,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -119978,7 +121378,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -119998,17 +121398,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -120027,9 +121427,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120039,8 +121439,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120058,7 +121460,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120105,8 +121507,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 748 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120115,11 +121517,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -120130,8 +121532,6 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120150,7 +121550,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -120170,17 +121570,17 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 96 + LSCB: 128 LSPA: 8 - LSPB: 5 + LSPB: 4 LVCA: 32 - LVCB: 48 + LVCB: 64 LVPA: 4 - LVPB: 3 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 @@ -120199,9 +121599,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -120211,8 +121611,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -120230,7 +121632,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -120277,8 +121679,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x96x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 749 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120287,11 +121689,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -120302,8 +121704,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120323,30 +121723,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -120384,13 +121784,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -120404,7 +121804,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -120451,8 +121851,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 750 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120467,187 +121867,15 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 - LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 743 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: 1 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -120748,7 +121976,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -120795,8 +122023,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 751 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120817,7 +122045,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -120831,27 +122059,27 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 @@ -120859,20 +122087,16 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -120885,7 +122109,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 128 MacroTileA: 64 @@ -120900,7 +122124,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -120919,7 +122143,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -120967,8 +122191,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 752 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -120987,11 +122211,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121010,7 +122234,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -121073,8 +122297,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121139,8 +122361,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 753 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121164,6 +122386,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121182,7 +122406,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -121245,8 +122469,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -121311,8 +122533,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 754 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121336,6 +122558,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121347,44 +122571,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121397,11 +122625,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121413,11 +122641,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 4 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121431,7 +122659,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -121479,8 +122707,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x16_SE_DTL0_EPS0_FL0_GRVW4_PGR0_PLR0_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 755 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121489,21 +122717,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121522,41 +122750,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 32 + LVCA: 128 LVCB: 64 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121570,10 +122798,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121584,10 +122812,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121602,7 +122832,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -121649,8 +122879,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 756 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121659,23 +122889,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121694,41 +122922,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 64 + LSPA: 2 LSPB: 4 - LVCA: 32 + LVCA: 128 LVCB: 64 - LVPA: 4 - LVPB: 2 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -121742,10 +122970,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -121756,10 +122984,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -121821,8 +123051,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 757 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -121831,23 +123061,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -121867,30 +123095,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false @@ -121928,13 +123156,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -121995,8 +123223,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 758 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122011,11 +123239,11 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -122038,31 +123266,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 LVPA: 2 LVPB: 4 LdcEqualsLdd: false @@ -122100,13 +123328,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122120,7 +123346,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122167,8 +123393,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 759 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122183,15 +123409,17 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122203,13 +123431,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122223,28 +123451,24 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 2 + LSPA: 8 LSPB: 4 - LVCA: 128 + LVCA: 32 LVCB: 64 - LVPA: 2 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122257,10 +123481,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122271,14 +123495,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122291,7 +123515,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122339,8 +123563,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 760 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122349,21 +123573,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122375,48 +123599,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 2 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122429,10 +123649,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122443,14 +123663,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -122463,8 +123683,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -122511,8 +123731,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR0_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 761 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122521,21 +123741,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122554,8 +123774,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -122563,32 +123783,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 + LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -122602,9 +123822,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -122615,11 +123835,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -122681,8 +123903,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 762 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -122691,10 +123913,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -122706,8 +123928,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -122725,7 +123945,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122754,22 +123974,26 @@ LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -122803,7 +124027,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -122851,20 +124075,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 763 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -122872,7 +124096,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -122893,7 +124117,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -122907,7 +124131,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -122922,22 +124146,26 @@ LVPA: 8 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -122952,7 +124180,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 @@ -122971,7 +124199,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -123019,29 +124247,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 764 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -123064,7 +124292,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -123077,43 +124305,43 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 + LSCB: 32 LSPA: 8 LSPB: 8 LVCA: 32 LVCB: 32 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 + LdsNumElements: 2048 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 512 LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB_Blk: 768 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123123,8 +124351,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -123191,15 +124419,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 765 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 4] ThreadTile0: 2 @@ -123207,13 +124435,13 @@ ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -123234,41 +124462,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -123283,9 +124511,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123295,14 +124523,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123363,31 +124589,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_2_WGM1 + SolutionIndex: 766 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123425,22 +124653,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 64 - LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 8 - LVPB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -123454,10 +124682,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123469,12 +124697,12 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123535,15 +124763,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_2_WGM1 + SolutionIndex: 767 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -123556,7 +124784,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -123571,7 +124799,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123587,48 +124815,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 8 LSCB: 32 - LSPA: 8 + LSPA: 32 LSPB: 8 - LVCA: 32 + LVCA: 8 LVCB: 32 - LVPA: 8 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3328 LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 512 + LdsOffsetA_Blk: 2048 LdsOffsetB: 256 - LdsOffsetB_Blk: 768 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123639,14 +124867,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123707,31 +124935,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM1 + SolutionIndex: 768 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] + ThreadTile: [2, 2] ThreadTile0: 2 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 2 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -123743,7 +124971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -123759,7 +124987,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -123769,28 +124997,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 16 - LVCA: 16 + LVCA: 8 LVCB: 16 - LVPA: 8 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -123798,9 +125026,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -123811,12 +125039,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -123877,31 +125105,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x16_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_8_2_WGM1 + SolutionIndex: 769 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -123924,56 +125152,56 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 - LVPA: 4 - LVPB: 8 + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -123983,14 +125211,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124051,15 +125279,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 + SolutionIndex: 770 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124067,13 +125295,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124107,28 +125335,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 - LSPB: 8 - LVCA: 8 - LVCB: 32 - LVPA: 32 - LVPB: 8 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124142,10 +125370,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124155,14 +125383,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124223,28 +125451,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 771 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -124259,16 +125487,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124281,26 +125509,26 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124313,11 +125541,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 4 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124329,10 +125557,12 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124393,33 +125623,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x32x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + SolutionIndex: 772 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124439,8 +125667,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124452,27 +125680,27 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 64 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124486,10 +125714,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -124567,15 +125795,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT16x64x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG4_16_4_WGM8 + SolutionIndex: 773 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -124588,8 +125816,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -124603,7 +125831,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -124611,7 +125839,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -124619,28 +125847,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 256 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -124657,10 +125885,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124671,14 +125899,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -124739,8 +125967,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x32_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 774 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124749,21 +125977,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124775,16 +126003,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -124797,26 +126025,26 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 256 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -124829,10 +126057,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -124843,13 +126071,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -124911,8 +126137,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x16x16_SE_DTL0_EPS1_FL0_GRVW2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 775 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -124921,10 +126147,10 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -124935,7 +126161,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -124947,15 +126175,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -124963,49 +126191,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125015,15 +126239,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125035,7 +126259,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125083,15 +126307,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 776 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125099,15 +126323,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125119,15 +126343,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -125135,49 +126359,45 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 64 + LVCB: 32 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 256 + LdsNumElements: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125187,15 +126407,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125207,7 +126427,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125255,15 +126475,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x16_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 777 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125271,15 +126491,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125291,14 +126511,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 + ExpandPointerSwap: false + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -125318,38 +126538,34 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + LSCB: 32 + LSPA: 4 + LSPB: 8 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125359,13 +126575,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumLoadsPerpendicularB: 2 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -125377,7 +126595,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -125425,15 +126643,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x16x32_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 778 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -125446,12 +126664,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125469,7 +126685,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -125483,24 +126699,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 2 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125515,9 +126735,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125527,14 +126747,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125547,8 +126767,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125595,8 +126815,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 779 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125605,17 +126825,17 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -125637,7 +126857,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -125658,17 +126878,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 64 LSPA: 2 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 32 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 768 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125683,9 +126907,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125695,14 +126919,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125715,8 +126939,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125763,8 +126987,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 780 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125773,11 +126997,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -125785,7 +127009,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -125799,44 +127023,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 64 + LVCB: 64 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -125849,11 +127077,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -125863,14 +127091,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -125883,8 +127111,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -125931,8 +127159,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG16_8_1_WGM8 + SolutionIndex: 781 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -125941,21 +127169,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -125975,30 +127203,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126036,13 +127264,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126103,8 +127331,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 782 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126119,11 +127347,11 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -126147,30 +127375,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126209,12 +127437,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126275,8 +127503,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 783 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126291,13 +127519,13 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -126318,31 +127546,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 LVPB: 2 LdcEqualsLdd: false @@ -126381,12 +127609,10 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126447,8 +127673,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 784 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126463,7 +127689,7 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -126472,6 +127698,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126490,7 +127718,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -126553,8 +127781,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -126572,7 +127798,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126619,8 +127845,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 785 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -126641,9 +127867,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126656,47 +127884,211 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 + DirectToLds: true + DirectToLdsA: true + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: true + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 786 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -126709,10 +128101,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126723,14 +128115,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126743,8 +128135,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126791,31 +128183,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 787 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -126835,40 +128227,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -126882,9 +128274,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -126895,12 +128287,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -126914,7 +128306,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -126961,29 +128353,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 788 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -126999,48 +128391,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 64 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 - LVPA: 2 + LVCB: 32 + LVPA: 4 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127053,10 +128441,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127067,12 +128455,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127085,7 +128473,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -127133,31 +128521,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 789 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127173,8 +128561,8 @@ CheckTensorDimAsserts: false DepthU: 8 DirectToLds: true - DirectToLdsA: true - DirectToLdsB: false + DirectToLdsA: false + DirectToLdsB: true DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -127197,18 +128585,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 - LVCA: 128 - LVCB: 32 - LVPA: 1 - LVPB: 4 + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 4 + LVPB: 1 LdcEqualsLdd: false LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127217,15 +128605,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true - LocalWriteUseSgprB: false + LocalWriteUseSgprA: false + LocalWriteUseSgprB: true LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -127237,12 +128625,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127303,29 +128691,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 790 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -127339,44 +128727,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127389,10 +128781,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127403,14 +128795,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127423,8 +128815,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127471,8 +128863,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL0_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM8 + SolutionIndex: 791 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127481,21 +128873,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127514,41 +128906,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 256 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127562,9 +128954,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127575,12 +128967,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127594,7 +128988,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127641,8 +129035,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + SolutionIndex: 792 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127651,23 +129045,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -127679,44 +129071,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 64 LSPA: 8 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 32 - LVPA: 4 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1536 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127729,10 +129125,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127743,12 +129139,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127761,8 +129157,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -127809,8 +129205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x64x16_SE_DTL0_EPS0_FL1_GRVW2_PGR0_PLR1_TT4_4_USFGRO0_VW2_WG8_16_1_WGM1 + SolutionIndex: 793 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127819,21 +129215,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -127848,43 +129244,47 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true + DirectToLds: false DirectToLdsA: false - DirectToLdsB: true + DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 128 LSPA: 4 - LSPB: 1 + LSPB: 2 LVCA: 32 - LVCB: 128 - LVPA: 4 + LVCB: 64 + LVPA: 2 LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -127894,13 +129294,13 @@ LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false - LocalWriteUseSgprB: true + LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 128 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -127911,14 +129311,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -127931,7 +129329,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -127979,8 +129377,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x128x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 794 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -127989,21 +129387,23 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 + ThreadTile: [8, 8] + ThreadTile0: 8 ThreadTile1: 8 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128022,7 +129422,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -128042,21 +129442,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 8 - LSPB: 8 + LSPB: 4 LVCA: 16 - LVCB: 16 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 1 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128071,9 +129471,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128083,14 +129483,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128151,8 +129549,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 795 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -128161,11 +129559,11 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -128173,9 +129571,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128195,32 +129595,32 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -128255,15 +129655,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128319,33 +129719,35 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM8 + SolutionIndex: 796 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -128366,33 +129768,33 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -128427,13 +129829,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128489,37 +129893,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 797 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128531,48 +129935,44 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 4 - LSPB: 2 - LVCA: 32 + LSPB: 4 + LVCA: 64 LVCB: 64 - LVPA: 2 - LVPB: 1 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128585,11 +129985,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128599,13 +129999,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128617,7 +130019,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -128661,37 +130063,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG8_16_1_WGM8 + SolutionIndex: 798 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128703,48 +130105,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCB: 64 + LSPA: 4 LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 2 - LVPB: 1 + LVCA: 64 + LVCB: 64 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128757,11 +130159,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128771,13 +130173,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -128790,7 +130194,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -128833,37 +130237,37 @@ TransposeB: true UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_8_USFGRO0_VW4_WG8_16_1_WGM8 + SolutionIndex: 799 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -128895,28 +130299,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -128931,9 +130335,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -128943,14 +130347,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -128964,7 +130368,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129013,8 +130417,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 800 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129023,19 +130427,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -129076,21 +130480,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129105,9 +130509,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129117,14 +130521,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129187,8 +130591,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 801 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129197,11 +130601,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -129209,7 +130613,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -129223,13 +130627,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -129250,17 +130654,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 + LSCB: 128 LSPA: 4 - LSPB: 4 + LSPB: 2 LVCA: 64 - LVCB: 64 + LVCB: 128 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129273,11 +130681,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129287,13 +130695,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -129307,8 +130715,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -129357,8 +130765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 802 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129367,11 +130775,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -129379,9 +130787,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129393,48 +130801,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129447,11 +130855,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129461,14 +130869,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129482,7 +130888,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -129531,8 +130937,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 803 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129541,21 +130947,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129574,31 +130982,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -129637,12 +131045,10 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129705,8 +131111,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 804 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129721,15 +131127,17 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129748,31 +131156,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 + LSPA: 8 + LSPB: 4 + LVCA: 32 + LVCB: 64 LVPA: 4 LVPB: 2 LdcEqualsLdd: false @@ -129810,13 +131218,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -129830,7 +131236,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -129879,8 +131285,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 805 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -129895,15 +131301,17 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -129915,13 +131323,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -129941,22 +131349,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 4 - LSPB: 2 - LVCA: 64 - LVCB: 128 - LVPA: 4 - LVPB: 2 + LSCA: 128 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 128 + LVCB: 64 + LVPA: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -129969,11 +131373,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -129985,11 +131389,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 @@ -130003,8 +131407,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -130053,8 +131457,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL0_GRVW1_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 806 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -130063,11 +131467,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -130075,9 +131479,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130096,41 +131500,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130145,9 +131549,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130158,12 +131562,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130225,15 +131631,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 807 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -130245,13 +131651,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130270,41 +131674,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 + LSCB: 64 LSPA: 8 - LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 2 LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130319,9 +131723,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130332,12 +131736,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130350,7 +131756,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -130399,15 +131805,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 808 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] ThreadTile0: 4 @@ -130419,13 +131825,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130438,47 +131842,43 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 128 - LSPA: 8 + LSCA: 128 + LSCB: 32 + LSPA: 1 LSPB: 4 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130487,15 +131887,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -130506,12 +131906,14 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130523,7 +131925,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -130573,33 +131975,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_DTL0_EPS1_FL1_GRVW2_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 809 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130611,44 +132011,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 128 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -130661,10 +132065,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -130677,13 +132081,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -130695,8 +132099,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -130745,14 +132149,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_DTL0_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 810 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -130761,15 +132165,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130788,7 +132192,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -130851,8 +132255,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -130919,31 +132321,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 811 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -130963,21 +132367,22 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly @@ -130985,10 +132390,10 @@ LSCB: 64 LSPA: 8 LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LVCA: 32 + LVCB: 32 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false LdsNumElements: 2048 LdsNumElementsAlignedA: 512 @@ -131023,7 +132428,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 @@ -131031,20 +132436,22 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -131093,28 +132500,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 812 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -131130,43 +132537,48 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 - LVCA: 128 + LSCA: 96 + LSCB: 64 + LSPA: 5 + LSPB: 8 + LVCA: 48 LVCB: 32 - LVPA: 1 + LVPA: 3 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131175,15 +132587,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -131193,27 +132605,27 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -131263,31 +132675,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_DTL1_EPS0_FL0_GRVW1_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 813 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT6_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -131307,40 +132721,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 8 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131354,9 +132769,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131368,27 +132783,29 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -131437,14 +132854,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 814 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -131457,9 +132874,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -131481,40 +132898,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: false + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 8 + LSPA: 4 LSPB: 8 - LVCA: 16 - LVCB: 16 + LVCA: 64 + LVCB: 32 LVPA: 2 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131528,9 +132946,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -131542,18 +132960,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -131609,14 +133029,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 815 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -131629,15 +133049,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -131647,28 +133067,28 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 @@ -131676,20 +133096,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131702,7 +133122,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -131711,15 +133131,12 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -131788,8 +133205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 + SolutionIndex: 816 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131808,13 +133225,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -131833,40 +133252,40 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 64 - LSPA: 5 + LSCA: 64 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 48 + LVCA: 32 LVCB: 32 - LVPA: 3 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -131880,24 +133299,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -131963,8 +133381,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT6_4_USFGRO0_VW2_WGM8 + SolutionIndex: 817 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -131973,25 +133391,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132008,42 +133426,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 64 - LSPA: 4 + LSCA: 64 + LSCB: 128 + LSPA: 8 LSPB: 8 - LVCA: 64 + LVCA: 32 LVCB: 32 - LVPA: 2 - LVPB: 4 + LVPA: 4 + LVPB: 2 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132057,26 +133475,23 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -132142,8 +133557,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 818 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132152,23 +133567,25 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132186,41 +133603,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 8 LSPB: 8 - LVCA: 64 - LVCB: 32 + LVCA: 16 + LVCB: 16 LVPA: 2 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132234,26 +133651,25 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -132268,7 +133684,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -132317,14 +133733,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 819 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [8, 4] @@ -132337,15 +133753,15 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132355,49 +133771,49 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132410,7 +133826,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -132425,10 +133841,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132444,7 +133862,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -132493,8 +133911,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 820 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132509,19 +133927,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132538,42 +133954,42 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132588,9 +134004,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132599,12 +134015,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132620,7 +134038,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -132669,8 +134087,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 821 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132679,25 +134097,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132707,49 +134123,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 128 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 64 + LVCB: 64 LVPA: 4 - LVPB: 2 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132762,11 +134174,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132775,12 +134187,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132795,7 +134209,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -132845,8 +134259,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 822 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -132855,25 +134269,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -132884,48 +134296,44 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 64 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 128 + LVCB: 32 + LVPA: 1 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1280 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -132934,15 +134342,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -132953,10 +134361,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -132971,7 +134381,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133021,15 +134431,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG8_16_1_WGM1 + SolutionIndex: 823 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_AMAS3_DTL1_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [8, 4] ThreadTile0: 8 @@ -133037,17 +134447,15 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133059,49 +134467,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133114,7 +134523,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -133128,9 +134537,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -133150,13 +134557,14 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133180,6 +134588,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133199,8 +134608,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 824 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133215,15 +134624,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133239,45 +134650,46 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 64 - LSPA: 4 + LSCB: 128 + LSPA: 8 LSPB: 4 - LVCA: 64 + LVCA: 32 LVCB: 64 LVPA: 4 - LVPB: 4 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133292,9 +134704,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133303,13 +134715,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 @@ -133326,13 +134736,14 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133356,6 +134767,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133375,8 +134787,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR1_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 825 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_NLCA1_NLCB1_PGR1_PLR1_TT4_8_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133385,21 +134797,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -133415,41 +134829,42 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 64 - LVCB: 64 - LVPA: 4 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 2 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133463,9 +134878,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -133475,14 +134890,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -133505,6 +134920,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133528,6 +134944,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133547,8 +134964,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 826 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_NLCA1_NLCB1_PGR0_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -133557,19 +134974,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -133584,12 +135001,13 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -133603,7 +135021,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -133611,17 +135029,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 32 - LSPA: 1 - LSPB: 4 + LSCB: 128 + LSPA: 2 + LSPB: 2 LVCA: 128 - LVCB: 32 - LVPA: 1 - LVPB: 4 + LVCB: 128 + LVPA: 2 + LVPB: 2 LdcEqualsLdd: false - LdsNumElements: 1280 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -133630,15 +135052,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -133647,15 +135069,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -133669,7 +135091,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -133677,6 +135099,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -133700,6 +135123,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: true @@ -133719,29 +135143,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 - SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x32x8_SE_AMAS3_DTL1_EPS0_GRVW1_GSU1_NLCA1_NLCB1_PBD0_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 827 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_NLCA1_NLCB1_PGR1_PLR1_TT8_8_USFGRO1_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - [2, 3, 0, 1] @@ -138979,5220 +140403,5236 @@ - [400, 8824.43] - - [784, 128, 64, 512] - [402, 9393.09] + - - [65, 1024, 1, 6400] + - [413, 3556.98] + - - [256, 4096, 1, 6400] + - [414, 10132.4] + - - [1024, 4096, 1, 64] + - [415, 6918.44] + - - [1024, 4096, 1, 6336] + - [416, 10393.9] - - [1024, 128, 1, 128] - - [417, 1028.02] + - [421, 1028.02] - - [4, 704, 1, 1280] - - [456, 363.355] + - [460, 363.355] - - [4, 1856, 1, 3328] - - [456, 579.434] + - [460, 579.434] - - [1856, 448, 1, 3328] - - [493, 6966.73] + - [497, 6966.73] - - [2944, 4288, 1, 1280] - - [488, 9057.88] + - [492, 9057.88] - - [2368, 64, 1, 3328] - - [449, 5837.56] + - [453, 5837.56] - - [2368, 5888, 1, 256] - - [493, 9111.06] + - [497, 9111.06] - - [128, 64, 1, 256] - - [455, 374.491] + - [459, 374.491] - - [5888, 1024, 1, 1280] - - [498, 8570.44] + - [502, 8570.44] - - [128, 6784, 1, 3328] - - [461, 7703.86] + - [465, 7703.86] - - [64, 4, 1, 256] - - [507, 11.2219] + - [511, 11.2219] - - [5888, 1856, 1, 3328] - - [493, 9394.3] + - [497, 9394.3] - - [5056, 704, 1, 256] - - [496, 8026.89] + - [500, 8026.89] - - [5888, 2944, 1, 3328] - - [486, 7608.11] + - [490, 7608.11] - - [1856, 4288, 1, 256] - - [487, 8986.32] + - [491, 8986.32] - - [1024, 5056, 1, 128] - - [479, 3898.24] + - [483, 3898.24] - - [5056, 5056, 1, 3328] - - [487, 9536.75] + - [491, 9536.75] - - [1408, 5888, 1, 1280] - - [488, 9279.09] + - [492, 9279.09] - - [2368, 448, 1, 128] - - [480, 2474.32] + - [484, 2474.32] - - [1024, 3584, 1, 3328] - - [490, 9258.48] + - [494, 9258.48] - - [4, 2944, 1, 1280] - - [442, 611.74] + - [446, 611.74] - - [1408, 64, 1, 128] - - [413, 858.21] + - [417, 858.21] - - [256, 4288, 1, 3328] - - [493, 7615.98] + - [497, 7615.98] - - [5888, 1408, 1, 1280] - - [486, 9620.29] + - [490, 9620.29] - - [704, 1856, 1, 3328] - - [487, 9033.65] + - [491, 9033.65] - - [4, 1408, 1, 128] - - [500, 24.355] + - [504, 24.355] - - [1024, 2368, 1, 256] - - [487, 7526.15] + - [491, 7526.15] - - [1408, 1856, 1, 1280] - - [490, 8324.09] + - [494, 8324.09] - - [1408, 64, 1, 1280] - - [461, 4681.14] + - [465, 4681.14] - - [448, 1024, 1, 1280] - - [487, 7112.43] + - [491, 7112.43] - - [256, 1408, 1, 3328] - - [493, 5825.41] + - [497, 5825.41] - - [5056, 5056, 1, 1280] - - [496, 9233.55] + - [500, 9233.55] - - [448, 5056, 1, 256] - - [488, 7003.17] + - [492, 7003.17] - - [704, 1856, 1, 1280] - - [487, 8877.28] + - [491, 8877.28] - - [128, 5056, 1, 128] - - [479, 2301.04] + - [483, 2301.04] - - [2368, 128, 1, 256] - - [487, 3848.94] + - [491, 3848.94] - - [1856, 1408, 1, 128] - - [482, 4202.21] + - [486, 4202.21] - - [64, 5056, 1, 256] - - [488, 3109.52] + - [492, 3109.52] - - [6784, 256, 1, 3328] - - [487, 6388.43] + - [491, 6388.43] - - [6784, 4288, 1, 3328] - - [498, 9114.57] + - [502, 9114.57] - - [4288, 448, 1, 256] - - [491, 5782.95] + - [495, 5782.95] - - [64, 704, 1, 128] - - [424, 379.419] + - [428, 379.419] - - [1856, 2368, 1, 3328] - - [487, 9128.36] + - [491, 9128.36] - - [4288, 2944, 1, 1280] - - [493, 9182.23] + - [497, 9182.23] - - [704, 5056, 1, 1280] - - [487, 9071.47] + - [491, 9071.47] - - [2368, 704, 1, 3328] - - [493, 7731.33] + - [497, 7731.33] - - [256, 5888, 1, 256] - - [487, 7920.28] + - [491, 7920.28] - - [1856, 4288, 1, 3328] - - [493, 9329.97] + - [497, 9329.97] - - [256, 2944, 1, 256] - - [494, 5312.17] + - [498, 5312.17] - - [5888, 1024, 1, 256] - - [485, 6710.87] + - [489, 6710.87] - - [448, 64, 1, 1280] - - [460, 2814.43] + - [464, 2814.43] - - [448, 5056, 1, 3328] - - [487, 8255.43] + - [491, 8255.43] - - [3584, 4, 1, 1280] - - [436, 640.715] + - [440, 640.715] - - [2944, 64, 1, 256] - - [435, 2621.44] + - [439, 2621.44] - - [128, 4, 1, 1280] - - [507, 86.2316] + - [511, 86.2316] - - [1408, 2944, 1, 256] - - [487, 8848.89] + - [491, 8848.89] - - [256, 1856, 1, 1280] - - [487, 7366.45] + - [491, 7366.45] - - [6784, 5056, 1, 3328] - - [498, 8332.06] + - [502, 8332.06] - - [5056, 5056, 1, 256] - - [493, 9171.64] + - [497, 9171.64] - - [1408, 6784, 1, 128] - - [479, 5079.09] + - [483, 5079.09] - - [64, 1024, 1, 1280] - - [451, 3679.21] + - [455, 3679.21] - - [2944, 4, 1, 256] - - [442, 369.443] + - [446, 369.443] - - [704, 5056, 1, 128] - - [479, 4509.17] + - [483, 4509.17] - - [4, 2368, 1, 1280] - - [436, 569.744] + - [440, 569.744] - - [2368, 2944, 1, 1280] - - [498, 7451.04] + - [502, 7451.04] - - [128, 3584, 1, 1280] - - [496, 6071.16] + - [500, 6071.16] - - [6784, 6784, 1, 1280] - - [493, 9535.64] + - [497, 9535.64] - - [1408, 4288, 1, 1280] - - [496, 8254.99] + - [500, 8254.99] - - [3584, 4288, 1, 1280] - - [498, 9651.09] + - [502, 9651.09] - - [2368, 704, 1, 1280] - - [493, 8291.3] + - [497, 8291.3] - - [5056, 4288, 1, 3328] - - [485, 9406.26] + - [489, 9406.26] - - [3584, 2368, 1, 3328] - - [493, 9350.22] + - [497, 9350.22] - - [64, 704, 1, 1280] - - [460, 3384.49] + - [464, 3384.49] - - [4288, 256, 1, 256] - - [493, 5593.52] + - [497, 5593.52] - - [2944, 128, 1, 128] - - [415, 2130.5] + - [419, 2130.5] - - [6784, 448, 1, 1280] - - [496, 8815.75] + - [500, 8815.75] - - [1408, 2944, 1, 128] - - [479, 4558.24] + - [483, 4558.24] - - [4288, 2944, 1, 256] - - [498, 7865.33] + - [502, 7865.33] - - [5888, 704, 1, 1280] - - [487, 9262.89] + - [491, 9262.89] - - [1856, 64, 1, 1280] - - [461, 4359.05] + - [465, 4359.05] - - [448, 5888, 1, 128] - - [482, 4000.49] + - [486, 4000.49] - - [5888, 64, 1, 3328] - - [462, 6603.29] + - [466, 6603.29] - - [2944, 256, 1, 3328] - - [487, 8423.53] + - [491, 8423.53] - - [1024, 64, 1, 128] - - [432, 582.542] + - [436, 582.542] - - [5056, 2368, 1, 1280] - - [487, 9419.81] + - [491, 9419.81] - - [448, 3584, 1, 1280] - - [487, 7985.72] + - [491, 7985.72] - - [6784, 5888, 1, 256] - - [485, 9494.26] + - [489, 9494.26] - - [704, 1024, 1, 128] - - [479, 2813.25] + - [483, 2813.25] - - [704, 128, 1, 1280] - - [461, 4477.61] + - [465, 4477.61] - - [5888, 2944, 1, 128] - - [482, 4745.86] + - [486, 4745.86] - - [4, 3584, 1, 128] - - [499, 96.379] + - [503, 96.379] - - [1408, 448, 1, 1280] - - [487, 6912.7] + - [491, 6912.7] - - [1024, 1408, 1, 256] - - [495, 5810.75] + - [499, 5810.75] - - [2368, 2368, 1, 3328] - - [496, 9088.61] + - [500, 9088.61] - - [1856, 6784, 1, 128] - - [482, 5168.22] + - [486, 5168.22] - - [5056, 704, 1, 3328] - - [488, 7464.8] + - [492, 7464.8] - - [1408, 1856, 1, 256] - - [493, 6727.59] + - [497, 6727.59] - - [1408, 704, 1, 3328] - - [493, 8379.43] + - [497, 8379.43] - - [2368, 5056, 1, 256] - - [493, 8664.01] + - [497, 8664.01] - - [5888, 1856, 1, 256] - - [498, 5809.92] + - [502, 5809.92] - - [4288, 64, 1, 3328] - - [475, 6583.84] + - [479, 6583.84] - - [2368, 4, 1, 1280] - - [508, 545.151] + - [512, 545.151] - - [704, 5888, 1, 256] - - [493, 8813.61] + - [497, 8813.61] - - [4288, 64, 1, 256] - - [451, 3059.87] + - [455, 3059.87] - - [6784, 64, 1, 256] - - [493, 3490.86] + - [497, 3490.86] - - [2944, 256, 1, 256] - - [487, 6970.3] + - [491, 6970.3] - - [2944, 6784, 1, 3328] - - [487, 9475.69] + - [491, 9475.69] - - [704, 1408, 1, 3328] - - [487, 8154.08] + - [491, 8154.08] - - [3584, 704, 1, 3328] - - [487, 8994.97] + - [491, 8994.97] - - [2944, 256, 1, 128] - - [479, 2824.03] + - [483, 2824.03] - - [6784, 4, 1, 1280] - - [436, 625.614] + - [440, 625.614] - - [1024, 64, 1, 1280] - - [448, 3307.81] + - [452, 3307.81] - - [448, 4288, 1, 256] - - [493, 6074.38] + - [497, 6074.38] - - [64, 3584, 1, 3328] - - [441, 6200.16] + - [445, 6200.16] - - [704, 2368, 1, 1280] - - [487, 8291.3] + - [491, 8291.3] - - [448, 2944, 1, 128] - - [479, 3221.77] + - [483, 3221.77] - - [1856, 2368, 1, 1280] - - [498, 6855.14] + - [502, 6855.14] - - [2368, 128, 1, 3328] - - [449, 6479.51] + - [453, 6479.51] - - [2944, 128, 1, 256] - - [487, 3828.13] + - [491, 3828.13] - - [448, 1408, 1, 256] - - [488, 4525.8] + - [492, 4525.8] - - [1856, 4288, 1, 1280] - - [486, 9160.22] + - [490, 9160.22] - - [64, 5056, 1, 3328] - - [469, 6819.2] + - [473, 6819.2] - - [4, 704, 1, 256] - - [453, 123.441] + - [457, 123.441] - - [1024, 448, 1, 128] - - [482, 1989.17] + - [486, 1989.17] - - [704, 4, 1, 1280] - - [456, 381.831] + - [460, 381.831] - - [704, 256, 1, 128] - - [479, 1109.07] + - [483, 1109.07] - - [704, 2944, 1, 128] - - [479, 4088.93] + - [483, 4088.93] - - [1408, 1024, 1, 1280] - - [493, 8191.98] + - [497, 8191.98] - - [704, 6784, 1, 256] - - [487, 6717.8] + - [491, 6717.8] - - [6784, 704, 1, 256] - - [493, 5429.12] + - [497, 5429.12] - - [5056, 1408, 1, 128] - - [479, 4954.4] + - [483, 4954.4] - - [256, 3584, 1, 3328] - - [487, 7890.86] + - [491, 7890.86] - - [4, 5888, 1, 3328] - - [504, 690.947] + - [508, 690.947] - - [128, 1408, 1, 128] - - [426, 1393.04] + - [430, 1393.04] - - [3584, 4288, 1, 3328] - - [489, 8900.77] + - [493, 8900.77] - - [5888, 1856, 1, 1280] - - [490, 9345.75] + - [494, 9345.75] - - [5056, 1024, 1, 3328] - - [491, 7834.74] + - [495, 7834.74] - - [5056, 64, 1, 1280] - - [469, 5890.04] + - [473, 5890.04] - - [1024, 704, 1, 256] - - [487, 6007.47] + - [491, 6007.47] - - [1024, 4288, 1, 128] - - [481, 3496.99] + - [485, 3496.99] - - [4288, 64, 1, 1280] - - [466, 4726.49] + - [470, 4726.49] - - [2368, 3584, 1, 1280] - - [485, 8128.72] + - [489, 8128.72] - - [2368, 6784, 1, 1280] - - [485, 9478.62] + - [489, 9478.62] - - [1024, 256, 1, 256] - - [493, 4092.0] + - [497, 4092.0] - - [1856, 4, 1, 1280] - - [508, 509.803] + - [512, 509.803] - - [448, 448, 1, 256] - - [493, 3001.18] + - [497, 3001.18] - - [2944, 3584, 1, 3328] - - [494, 9081.81] + - [498, 9081.81] - - [128, 4288, 1, 128] - - [414, 2323.23] + - [418, 2323.23] - - [64, 448, 1, 256] - - [457, 1066.87] + - [461, 1066.87] - - [128, 1024, 1, 3328] - - [470, 6392.26] + - [474, 6392.26] - - [4, 1408, 1, 3328] - - [453, 616.556] + - [457, 616.556] - - [6784, 2944, 1, 256] - - [496, 8547.63] + - [500, 8547.63] - - [64, 1856, 1, 1280] - - [469, 4409.61] + - [473, 4409.61] - - [64, 1024, 1, 128] - - [413, 554.802] + - [417, 554.802] - - [4288, 2368, 1, 3328] - - [489, 8779.98] + - [493, 8779.98] - - [1856, 2368, 1, 256] - - [496, 4976.64] + - [500, 4976.64] - - [3584, 256, 1, 128] - - [481, 2812.27] + - [485, 2812.27] - - [3584, 6784, 1, 3328] - - [491, 9278.12] + - [495, 9278.12] - - [256, 1024, 1, 256] - - [487, 4346.43] + - [491, 4346.43] - - [4, 6784, 1, 3328] - - [506, 681.266] + - [510, 681.266] - - [1024, 5888, 1, 3328] - - [487, 9187.51] + - [491, 9187.51] - - [1024, 128, 1, 1280] - - [439, 3659.95] + - [443, 3659.95] - - [4288, 128, 1, 1280] - - [493, 6019.07] + - [497, 6019.07] - - [5056, 4288, 1, 1280] - - [485, 9343.86] + - [489, 9343.86] - - [5888, 64, 1, 256] - - [487, 4692.07] + - [491, 4692.07] - - [1856, 256, 1, 1280] - - [493, 4790.28] + - [497, 4790.28] - - [64, 5888, 1, 3328] - - [461, 6702.1] + - [465, 6702.1] - - [2944, 5888, 1, 128] - - [482, 5202.55] + - [486, 5202.55] - - [704, 5888, 1, 1280] - - [487, 9264.19] + - [491, 9264.19] - - [2368, 3584, 1, 128] - - [479, 5053.61] + - [483, 5053.61] - - [6784, 5888, 1, 3328] - - [485, 7926.7] + - [489, 7926.7] - - [704, 1024, 1, 1280] - - [486, 5402.5] + - [490, 5402.5] - - [448, 256, 1, 3328] - - [469, 6124.55] + - [473, 6124.55] - - [448, 1856, 1, 128] - - [480, 2885.86] + - [484, 2885.86] - - [128, 1024, 1, 128] - - [414, 1013.12] + - [418, 1013.12] - - [2944, 4, 1, 128] - - [499, 77.5374] + - [503, 77.5374] - - [1024, 704, 1, 1280] - - [487, 7365.48] + - [491, 7365.48] - - [128, 5888, 1, 256] - - [487, 6990.51] + - [491, 6990.51] - - [1024, 5056, 1, 1280] - - [492, 9421.9] + - [496, 9421.9] - - [4288, 1024, 1, 256] - - [494, 6269.93] + - [498, 6269.93] - - [2944, 2368, 1, 128] - - [479, 4918.08] + - [483, 4918.08] - - [704, 704, 1, 3328] - - [487, 7963.55] + - [491, 7963.55] - - [704, 1408, 1, 1280] - - [487, 8347.22] + - [491, 8347.22] - - [5888, 448, 1, 1280] - - [493, 5216.95] + - [497, 5216.95] - - [3584, 256, 1, 3328] - - [487, 7802.15] + - [491, 7802.15] - - [704, 5888, 1, 3328] - - [493, 8381.36] + - [497, 8381.36] - - [704, 1856, 1, 128] - - [479, 3598.28] + - [483, 3598.28] - - [128, 3584, 1, 3328] - - [449, 7161.01] + - [453, 7161.01] - - [6784, 2368, 1, 1280] - - [498, 9464.31] + - [502, 9464.31] - - [4, 4288, 1, 128] - - [499, 132.58] + - [503, 132.58] - - [128, 704, 1, 1280] - - [461, 4463.75] + - [465, 4463.75] - - [3584, 2944, 1, 256] - - [498, 8201.14] + - [502, 8201.14] - - [1856, 128, 1, 3328] - - [440, 6575.4] + - [444, 6575.4] - - [4, 64, 1, 1280] - - [456, 43.5745] + - [460, 43.5745] - - [4, 5056, 1, 3328] - - [436, 675.215] + - [440, 675.215] - - [128, 2944, 1, 1280] - - [440, 5916.89] + - [444, 5916.89] - - [2368, 1024, 1, 3328] - - [493, 8646.74] + - [497, 8646.74] - - [128, 256, 1, 3328] - - [474, 4130.75] + - [478, 4130.75] - - [1408, 5056, 1, 3328] - - [492, 9529.65] + - [496, 9529.65] - - [1856, 1856, 1, 3328] - - [491, 8114.89] + - [495, 8114.89] - - [3584, 128, 1, 256] - - [487, 5603.08] + - [491, 5603.08] - - [448, 1408, 1, 3328] - - [487, 7072.93] + - [491, 7072.93] - - [2368, 2368, 1, 256] - - [494, 7648.66] + - [498, 7648.66] - - [4288, 4288, 1, 1280] - - [489, 9244.01] + - [493, 9244.01] - - [64, 448, 1, 1280] - - [460, 2885.23] + - [464, 2885.23] - - [1408, 4288, 1, 256] - - [487, 8080.31] + - [491, 8080.31] - - [448, 4, 1, 256] - - [505, 84.3294] + - [509, 84.3294] - - [5888, 448, 1, 128] - - [482, 3540.7] + - [486, 3540.7] - - [448, 4, 1, 1280] - - [456, 322.157] + - [460, 322.157] - - [704, 6784, 1, 3328] - - [486, 8613.48] + - [490, 8613.48] - - [5888, 5888, 1, 1280] - - [493, 9501.95] + - [497, 9501.95] - - [5056, 1024, 1, 1280] - - [496, 9110.01] + - [500, 9110.01] - - [448, 5888, 1, 3328] - - [487, 8586.33] + - [491, 8586.33] - - [128, 4, 1, 128] - - [499, 4.17959] + - [503, 4.17959] - - [1024, 2944, 1, 1280] - - [495, 7096.43] + - [499, 7096.43] - - [5056, 5888, 1, 1280] - - [486, 9693.41] + - [490, 9693.41] - - [4288, 5888, 1, 128] - - [479, 5406.36] + - [483, 5406.36] - - [256, 3584, 1, 256] - - [487, 6908.27] + - [491, 6908.27] - - [1408, 3584, 1, 128] - - [479, 4645.59] + - [483, 4645.59] - - [256, 2944, 1, 3328] - - [490, 6284.3] + - [494, 6284.3] - - [448, 3584, 1, 128] - - [482, 3675.27] + - [486, 3675.27] - - [5888, 2944, 1, 1280] - - [492, 9628.8] + - [496, 9628.8] - - [4, 6784, 1, 1280] - - [436, 688.076] + - [440, 688.076] - - [2368, 5888, 1, 128] - - [479, 5273.86] + - [483, 5273.86] - - [64, 2944, 1, 128] - - [423, 1316.44] + - [427, 1316.44] - - [3584, 5888, 1, 256] - - [493, 9239.04] + - [497, 9239.04] - - [2368, 704, 1, 128] - - [482, 3537.55] + - [486, 3537.55] - - [3584, 2944, 1, 1280] - - [487, 9324.52] + - [491, 9324.52] - - [3584, 2368, 1, 128] - - [479, 4766.24] + - [483, 4766.24] - - [5056, 704, 1, 128] - - [479, 4487.85] + - [483, 4487.85] - - [448, 2368, 1, 128] - - [482, 2876.92] + - [486, 2876.92] - - [5056, 1408, 1, 3328] - - [498, 9515.87] + - [502, 9515.87] - - [1408, 704, 1, 256] - - [490, 6836.08] + - [494, 6836.08] - - [6784, 1024, 1, 3328] - - [485, 9309.55] + - [489, 9309.55] - - [6784, 2944, 1, 3328] - - [486, 9536.48] + - [490, 9536.48] - - [2944, 5056, 1, 3328] - - [487, 9526.15] + - [491, 9526.15] - - [1856, 1856, 1, 256] - - [487, 5239.14] + - [491, 5239.14] - - [1024, 5888, 1, 128] - - [479, 4006.18] + - [483, 4006.18] - - [2048, 7133, 1, 2048] - - [485, 9827.97] + - [489, 9827.97] - - [256, 4, 1, 128] - - [500, 4.28908] + - [504, 4.28908] - - [4288, 5888, 1, 1280] - - [495, 9202.73] + - [499, 9202.73] - - [4288, 4288, 1, 256] - - [490, 5521.08] + - [494, 5521.08] - - [448, 2944, 1, 3328] - - [493, 7724.43] + - [497, 7724.43] - - [4288, 1856, 1, 1280] - - [493, 8826.24] + - [497, 8826.24] - - [1856, 2944, 1, 3328] - - [487, 9194.8] + - [491, 9194.8] - - [256, 6784, 1, 3328] - - [487, 8740.23] + - [491, 8740.23] - - [64, 5888, 1, 256] - - [487, 4766.25] + - [491, 4766.25] - - [256, 5056, 1, 128] - - [479, 2937.5] + - [483, 2937.5] - - [5056, 1024, 1, 256] - - [498, 5467.81] + - [502, 5467.81] - - [704, 64, 1, 3328] - - [475, 4818.33] + - [479, 4818.33] - - [5056, 1856, 1, 3328] - - [492, 8861.59] + - [496, 8861.59] - - [4, 2944, 1, 3328] - - [442, 662.002] + - [446, 662.002] - - [4, 5056, 1, 256] - - [502, 494.021] + - [506, 494.021] - - [1856, 1408, 1, 256] - - [487, 8674.68] + - [491, 8674.68] - - [3584, 4, 1, 128] - - [499, 108.196] + - [503, 108.196] - - [448, 448, 1, 3328] - - [461, 6457.3] + - [465, 6457.3] - - [6784, 128, 1, 3328] - - [454, 7256.61] + - [458, 7256.61] - - [4288, 1408, 1, 128] - - [482, 4791.66] + - [486, 4791.66] - - [4288, 5056, 1, 256] - - [487, 8560.74] + - [491, 8560.74] - - [1408, 128, 1, 1280] - - [469, 5085.69] + - [473, 5085.69] - - [5056, 256, 1, 3328] - - [490, 7284.13] + - [494, 7284.13] - - [704, 704, 1, 256] - - [487, 6171.09] + - [491, 6171.09] - - [1024, 5888, 1, 1280] - - [492, 8852.79] + - [496, 8852.79] - - [6784, 2368, 1, 128] - - [480, 4729.2] + - [484, 4729.2] - - [4, 5056, 1, 1280] - - [453, 669.946] + - [457, 669.946] - - [64, 128, 1, 256] - - [455, 369.217] + - [459, 369.217] - - [128, 1856, 1, 1280] - - [449, 5549.03] + - [453, 5549.03] - - [5056, 3584, 1, 256] - - [493, 7115.74] + - [497, 7115.74] - - [1856, 1024, 1, 1280] - - [485, 8196.4] + - [489, 8196.4] - - [6784, 4288, 1, 1280] - - [486, 9509.56] + - [490, 9509.56] - - [1856, 1856, 1, 1280] - - [488, 5791.89] + - [492, 5791.89] - - [6784, 2944, 1, 128] - - [479, 5317.02] + - [483, 5317.02] - - [1408, 5056, 1, 1280] - - [488, 8980.63] + - [492, 8980.63] - - [4, 2368, 1, 3328] - - [453, 592.534] + - [457, 592.534] - - [5888, 1856, 1, 128] - - [478, 4600.1] + - [482, 4600.1] - - [448, 704, 1, 1280] - - [487, 2286.48] + - [491, 2286.48] - - [2368, 1024, 1, 128] - - [482, 3911.02] + - [486, 3911.02] - - [1024, 448, 1, 3328] - - [487, 7295.14] + - [491, 7295.14] - - [1856, 704, 1, 1280] - - [487, 8881.02] + - [491, 8881.02] - - [5056, 3584, 1, 128] - - [479, 4911.58] + - [483, 4911.58] - - [5888, 5888, 1, 3328] - - [495, 9243.8] + - [499, 9243.8] - - [6784, 1024, 1, 256] - - [498, 5475.31] + - [502, 5475.31] - - [2944, 2368, 1, 256] - - [493, 5670.67] + - [497, 5670.67] - - [256, 448, 1, 256] - - [444, 2293.76] + - [448, 2293.76] - - [5056, 5888, 1, 3328] - - [488, 7847.97] + - [492, 7847.97] - - [1856, 1024, 1, 256] - - [493, 7517.6] + - [497, 7517.6] - - [448, 1408, 1, 1280] - - [487, 6917.44] + - [491, 6917.44] - - [3584, 448, 1, 1280] - - [493, 7980.76] + - [497, 7980.76] - - [1024, 1024, 1, 1280] - - [490, 8384.42] + - [494, 8384.42] - - [448, 5888, 1, 256] - - [487, 7365.65] + - [491, 7365.65] - - [704, 64, 1, 128] - - [432, 358.655] + - [436, 358.655] - - [1408, 6784, 1, 3328] - - [493, 9094.09] + - [497, 9094.09] - - [448, 1024, 1, 128] - - [482, 1772.95] + - [486, 1772.95] - - [4288, 704, 1, 128] - - [479, 4355.28] + - [483, 4355.28] - - [128, 1856, 1, 128] - - [418, 1610.63] + - [422, 1610.63] - - [448, 2368, 1, 3328] - - [493, 7366.37] + - [497, 7366.37] - - [5056, 64, 1, 128] - - [418, 2157.23] + - [422, 2157.23] - - [5056, 2944, 1, 256] - - [487, 9123.06] + - [491, 9123.06] - - [6784, 5888, 1, 128] - - [478, 5285.8] + - [482, 5285.8] - - [704, 1024, 1, 256] - - [493, 6667.25] + - [497, 6667.25] - - [1024, 4, 1, 256] - - [442, 187.246] + - [446, 187.246] - - [2368, 1856, 1, 256] - - [493, 6777.84] + - [497, 6777.84] - - [128, 6784, 1, 1280] - - [490, 7052.61] + - [494, 7052.61] - - [1408, 3584, 1, 3328] - - [494, 9037.95] + - [498, 9037.95] - - [2368, 6784, 1, 256] - - [487, 9181.35] + - [491, 9181.35] - - [5056, 1408, 1, 1280] - - [492, 9421.9] + - [496, 9421.9] - - [256, 256, 1, 128] - - [424, 543.304] + - [428, 543.304] - - [5056, 4288, 1, 128] - - [482, 5339.92] + - [486, 5339.92] - - [1408, 1856, 1, 128] - - [479, 4270.89] + - [483, 4270.89] - - [1408, 5888, 1, 3328] - - [491, 9034.79] + - [495, 9034.79] - - [1856, 256, 1, 256] - - [493, 5847.83] + - [497, 5847.83] - - [6784, 6784, 1, 256] - - [486, 9624.38] + - [490, 9624.38] - - [64, 256, 1, 128] - - [425, 146.449] + - [429, 146.449] - - [4288, 2368, 1, 128] - - [478, 3896.94] + - [482, 3896.94] - - [1856, 4288, 1, 128] - - [479, 4337.07] + - [483, 4337.07] - - [256, 4288, 1, 1280] - - [487, 7499.42] + - [491, 7499.42] - - [2368, 2944, 1, 256] - - [492, 7703.18] + - [496, 7703.18] - - [4, 1856, 1, 256] - - [505, 263.964] + - [509, 263.964] - - [3584, 1856, 1, 1280] - - [487, 9224.33] + - [491, 9224.33] - - [6784, 6784, 1, 128] - - [479, 5476.03] + - [483, 5476.03] - - [256, 1856, 1, 128] - - [482, 1858.72] + - [486, 1858.72] - - [704, 64, 1, 1280] - - [460, 3368.67] + - [464, 3368.67] - - [5888, 5056, 1, 256] - - [493, 5859.81] + - [497, 5859.81] - - [3584, 448, 1, 256] - - [493, 7298.33] + - [497, 7298.33] - - [448, 4288, 1, 128] - - [479, 3813.45] + - [483, 3813.45] - - [2944, 4288, 1, 3328] - - [488, 9149.63] + - [492, 9149.63] - - [256, 6784, 1, 256] - - [487, 7984.85] + - [491, 7984.85] - - [1408, 4288, 1, 128] - - [482, 4728.34] + - [486, 4728.34] - - [2944, 704, 1, 3328] - - [493, 7149.76] + - [497, 7149.76] - - [128, 448, 1, 256] - - [459, 1699.08] + - [463, 1699.08] - - [512, 32, 1, 512] - - [459, 1127.5] + - [463, 1127.5] - - [3584, 3584, 1, 256] - - [488, 8558.01] + - [492, 8558.01] - - [448, 1408, 1, 128] - - [479, 2504.35] + - [483, 2504.35] - - [128, 256, 1, 1280] - - [460, 3216.49] + - [464, 3216.49] - - [3584, 5056, 1, 256] - - [485, 5674.35] + - [489, 5674.35] - - [6784, 128, 1, 256] - - [487, 6216.39] + - [491, 6216.39] - - [4288, 4, 1, 256] - - [503, 435.606] + - [507, 435.606] - - [64, 1408, 1, 3328] - - [461, 6185.91] + - [465, 6185.91] - - [704, 448, 1, 256] - - [493, 4004.98] + - [497, 4004.98] - - [2944, 2368, 1, 1280] - - [494, 8542.7] + - [498, 8542.7] - - [448, 64, 1, 3328] - - [474, 3835.23] + - [478, 3835.23] - - [1408, 3584, 1, 256] - - [487, 8714.53] + - [491, 8714.53] - - [3584, 4, 1, 3328] - - [442, 689.454] + - [446, 689.454] - - [6784, 3584, 1, 256] - - [492, 9271.24] + - [496, 9271.24] - - [256, 128, 1, 128] - - [425, 283.399] + - [429, 283.399] - - [704, 1408, 1, 128] - - [479, 3210.47] + - [483, 3210.47] - - [4, 2368, 1, 256] - - [505, 360.838] + - [509, 360.838] - - [2944, 448, 1, 128] - - [479, 3344.31] + - [483, 3344.31] - - [128, 1408, 1, 256] - - [487, 3186.28] + - [491, 3186.28] - - [4, 2944, 1, 256] - - [503, 384.522] + - [507, 384.522] - - [64, 128, 1, 3328] - - [456, 2103.62] + - [460, 2103.62] - - [5056, 2368, 1, 128] - - [479, 5219.66] + - [483, 5219.66] - - [2944, 2944, 1, 3328] - - [496, 9174.59] + - [500, 9174.59] - - [5056, 6784, 1, 256] - - [498, 8992.26] + - [502, 8992.26] - - [1856, 3584, 1, 128] - - [479, 4957.17] + - [483, 4957.17] - - [128, 2944, 1, 128] - - [417, 2241.38] + - [421, 2241.38] - - [1024, 704, 1, 3328] - - [497, 6545.01] + - [501, 6545.01] - - [6784, 448, 1, 256] - - [493, 5379.15] + - [497, 5379.15] - - [3584, 6784, 1, 128] - - [479, 5101.91] + - [483, 5101.91] - - [128, 4288, 1, 256] - - [487, 5211.76] + - [491, 5211.76] - - [704, 448, 1, 3328] - - [488, 4504.05] + - [492, 4504.05] - - [1024, 1024, 1, 3328] - - [490, 8009.67] + - [494, 8009.67] - - [128, 128, 1, 3328] - - [473, 3184.93] + - [477, 3184.93] - - [5056, 1856, 1, 256] - - [487, 9138.33] + - [491, 9138.33] - - [256, 128, 1, 256] - - [459, 1205.26] + - [463, 1205.26] - - [1024, 1856, 1, 256] - - [498, 6374.99] + - [502, 6374.99] - - [4288, 64, 1, 128] - - [415, 1695.33] + - [419, 1695.33] - - [256, 448, 1, 3328] - - [462, 5659.57] + - [466, 5659.57] - - [1408, 6784, 1, 1280] - - [487, 9349.1] + - [491, 9349.1] - - [3584, 3584, 1, 1280] - - [492, 9302.09] + - [496, 9302.09] - - [64, 2368, 1, 1280] - - [461, 4432.97] + - [465, 4432.97] - - [448, 2368, 1, 1280] - - [487, 7250.67] + - [491, 7250.67] - - [5888, 5888, 1, 128] - - [479, 4615.93] + - [483, 4615.93] - - [64, 6784, 1, 3328] - - [493, 6987.13] + - [497, 6987.13] - - [2944, 256, 1, 1280] - - [496, 6127.35] + - [500, 6127.35] - - [5056, 5888, 1, 128] - - [478, 5106.29] + - [482, 5106.29] - - [256, 2368, 1, 128] - - [479, 2141.13] + - [483, 2141.13] - - [5056, 2368, 1, 3328] - - [490, 9041.65] + - [494, 9041.65] - - [2944, 4288, 1, 256] - - [498, 8691.12] + - [502, 8691.12] - - [1408, 3584, 1, 1280] - - [487, 9069.9] + - [491, 9069.9] - - [2368, 64, 1, 256] - - [459, 2412.77] + - [463, 2412.77] - - [64, 448, 1, 3328] - - [474, 3739.04] + - [478, 3739.04] - - [256, 256, 1, 3328] - - [461, 5304.08] + - [465, 5304.08] - - [5888, 4, 1, 128] - - [500, 105.555] + - [504, 105.555] - - [1856, 704, 1, 256] - - [487, 8025.33] + - [491, 8025.33] - - [4, 4288, 1, 1280] - - [434, 578.97] + - [438, 578.97] - - [1408, 448, 1, 3328] - - [495, 5714.41] + - [499, 5714.41] - - [1024, 4, 1, 3328] - - [453, 608.549] + - [457, 608.549] - - [2368, 256, 1, 256] - - [493, 5172.98] + - [497, 5172.98] - - [2368, 6784, 1, 3328] - - [493, 9456.51] + - [497, 9456.51] - - [1856, 1408, 1, 1280] - - [498, 7805.09] + - [502, 7805.09] - - [1856, 448, 1, 1280] - - [485, 6184.94] + - [489, 6184.94] - - [6784, 704, 1, 128] - - [479, 4597.77] + - [483, 4597.77] - - [4, 4, 1, 256] - - [456, 0.691892] + - [460, 0.691892] - - [128, 5888, 1, 128] - - [417, 2691.66] + - [421, 2691.66] - - [1408, 5888, 1, 256] - - [492, 7164.17] + - [496, 7164.17] - - [704, 2944, 1, 1280] - - [494, 8139.71] + - [498, 8139.71] - - [1856, 2368, 1, 128] - - [482, 4623.28] + - [486, 4623.28] - - [4096, 7133, 1, 4096] - - [486, 9939.97] + - [490, 9939.97] - - [256, 64, 1, 256] - - [450, 689.853] + - [454, 689.853] - - [1024, 1024, 1, 256] - - [493, 7216.01] + - [497, 7216.01] - - [704, 1856, 1, 256] - - [493, 6364.07] + - [497, 6364.07] - - [128, 4288, 1, 3328] - - [449, 7200.49] + - [453, 7200.49] - - [3584, 704, 1, 1280] - - [496, 7971.98] + - [500, 7971.98] - - [256, 128, 1, 1280] - - [447, 2702.52] + - [451, 2702.52] - - [2368, 4, 1, 256] - - [442, 325.918] + - [446, 325.918] - - [256, 2368, 1, 1280] - - [487, 6638.83] + - [491, 6638.83] - - [2944, 6784, 1, 128] - - [478, 5233.43] + - [482, 5233.43] - - [3584, 448, 1, 3328] - - [487, 8094.3] + - [491, 8094.3] - - [1408, 4, 1, 256] - - [505, 243.546] + - [509, 243.546] - - [704, 2368, 1, 3328] - - [487, 8403.01] + - [491, 8403.01] - - [2944, 448, 1, 256] - - [487, 7022.49] + - [491, 7022.49] - - [1856, 448, 1, 128] - - [482, 2842.69] + - [486, 2842.69] - - [2368, 128, 1, 1280] - - [469, 5685.42] + - [473, 5685.42] - - [256, 5888, 1, 128] - - [484, 2178.61] + - [488, 2178.61] - - [64, 6784, 1, 256] - - [487, 5385.13] + - [491, 5385.13] - - [64, 5056, 1, 1280] - - [461, 5603.19] + - [465, 5603.19] - - [4, 6784, 1, 128] - - [499, 180.156] + - [503, 180.156] - - [2944, 2944, 1, 1280] - - [496, 9129.29] + - [500, 9129.29] - - [5888, 2368, 1, 256] - - [498, 6961.59] + - [502, 6961.59] - - [4, 3584, 1, 1280] - - [442, 646.13] + - [446, 646.13] - - [1408, 128, 1, 128] - - [428, 1172.19] + - [432, 1172.19] - - [6784, 704, 1, 3328] - - [493, 9084.52] + - [497, 9084.52] - - [128, 64, 1, 1280] - - [472, 1260.31] + - [476, 1260.31] - - [2368, 256, 1, 1280] - - [493, 6643.38] + - [497, 6643.38] - - [4, 448, 1, 3328] - - [456, 433.414] + - [460, 433.414] - - [5888, 4288, 1, 128] - - [480, 4753.07] + - [484, 4753.07] - - [4, 5888, 1, 256] - - [442, 471.04] + - [446, 471.04] - - [1408, 2944, 1, 3328] - - [496, 9207.0] + - [500, 9207.0] - - [3584, 704, 1, 128] - - [482, 3762.36] + - [486, 3762.36] - - [64, 1024, 1, 256] - - [460, 1807.89] + - [464, 1807.89] - - [5056, 5056, 1, 128] - - [483, 4830.06] + - [487, 4830.06] - - [2368, 448, 1, 1280] - - [487, 7263.06] + - [491, 7263.06] - - [128, 3584, 1, 256] - - [490, 4369.07] + - [494, 4369.07] - - [704, 448, 1, 1280] - - [488, 4205.23] + - [492, 4205.23] - - [448, 5056, 1, 128] - - [479, 3855.47] + - [483, 3855.47] - - [256, 4, 1, 1280] - - [510, 157.538] + - [514, 157.538] - - [128, 5056, 1, 256] - - [493, 6108.96] + - [497, 6108.96] - - [1408, 5056, 1, 128] - - [482, 4836.58] + - [486, 4836.58] - - [2944, 3584, 1, 128] - - [482, 4532.09] + - [486, 4532.09] - - [3584, 2368, 1, 256] - - [487, 8951.24] + - [491, 8951.24] - - [5888, 5056, 1, 1280] - - [498, 9276.39] + - [502, 9276.39] - - [2368, 5056, 1, 128] - - [482, 5167.56] + - [486, 5167.56] - - [64, 704, 1, 256] - - [442, 1501.87] + - [446, 1501.87] - - [4288, 256, 1, 1280] - - [487, 7496.2] + - [491, 7496.2] - - [3584, 3584, 1, 3328] - - [488, 9301.67] + - [492, 9301.67] - - [1024, 256, 1, 128] - - [479, 1508.74] + - [483, 1508.74] - - [4, 704, 1, 128] - - [500, 12.0469] + - [504, 12.0469] - - [5888, 6784, 1, 256] - - [486, 9370.37] + - [490, 9370.37] - - [4288, 2944, 1, 3328] - - [490, 9148.99] + - [494, 9148.99] - - [2944, 64, 1, 128] - - [426, 1456.36] + - [430, 1456.36] - - [1856, 64, 1, 256] - - [452, 2209.93] + - [456, 2209.93] - - [4288, 128, 1, 3328] - - [446, 6471.85] + - [450, 6471.85] - - [4288, 704, 1, 1280] - - [493, 8934.51] + - [497, 8934.51] - - [256, 5056, 1, 1280] - - [487, 8439.03] + - [491, 8439.03] - - [1408, 256, 1, 128] - - [482, 1769.07] + - [486, 1769.07] - - [2944, 5888, 1, 3328] - - [487, 9447.94] + - [491, 9447.94] - - [6784, 5888, 1, 1280] - - [498, 9372.15] + - [502, 9372.15] - - [704, 128, 1, 256] - - [444, 2059.7] + - [448, 2059.7] - - [5888, 4288, 1, 1280] - - [490, 9244.22] + - [494, 9244.22] - - [448, 256, 1, 1280] - - [469, 4741.62] + - [473, 4741.62] - - [5888, 3584, 1, 128] - - [478, 4979.96] + - [482, 4979.96] - - [1856, 1856, 1, 128] - - [482, 4363.88] + - [486, 4363.88] - - [5056, 4, 1, 1280] - - [502, 629.541] + - [506, 629.541] - - [256, 1408, 1, 1280] - - [493, 5588.34] + - [497, 5588.34] - - [512, 16, 1, 512] - - [453, 689.853] + - [457, 689.853] - - [704, 3584, 1, 128] - - [482, 4069.57] + - [486, 4069.57] - - [5888, 448, 1, 3328] - - [498, 7925.84] + - [502, 7925.84] - - [2368, 4288, 1, 1280] - - [497, 8492.6] + - [501, 8492.6] - - [4288, 2944, 1, 128] - - [479, 5238.11] + - [483, 5238.11] - - [1024, 6784, 1, 3328] - - [493, 8578.08] + - [497, 8578.08] - - [128, 2368, 1, 256] - - [493, 3788.8] + - [497, 3788.8] - - [6784, 64, 1, 3328] - - [487, 7003.36] + - [491, 7003.36] - - [5056, 2944, 1, 3328] - - [490, 8575.35] + - [494, 8575.35] - - [448, 128, 1, 256] - - [442, 1714.96] + - [446, 1714.96] - - [2944, 3584, 1, 256] - - [487, 8994.16] + - [491, 8994.16] - - [1408, 1408, 1, 3328] - - [485, 8757.6] + - [489, 8757.6] - - [1856, 128, 1, 1280] - - [487, 5598.07] + - [491, 5598.07] - - [3584, 3584, 1, 128] - - [478, 4787.34] + - [482, 4787.34] - - [64, 3584, 1, 256] - - [493, 3545.91] + - [497, 3545.91] - - [1408, 4, 1, 3328] - - [437, 640.14] + - [441, 640.14] - - [128, 2944, 1, 3328] - - [461, 7204.14] + - [465, 7204.14] - - [3584, 704, 1, 256] - - [487, 6239.59] + - [491, 6239.59] - - [2944, 448, 1, 3328] - - [493, 7726.61] + - [497, 7726.61] - - [3584, 1408, 1, 3328] - - [485, 9358.68] + - [489, 9358.68] - - [704, 3584, 1, 1280] - - [493, 8005.18] + - [497, 8005.18] - - [2944, 6784, 1, 1280] - - [485, 9487.63] + - [489, 9487.63] - - [1856, 6784, 1, 256] - - [487, 5684.46] + - [491, 5684.46] - - [4288, 448, 1, 3328] - - [493, 8410.28] + - [497, 8410.28] - - [6784, 4288, 1, 128] - - [483, 4785.48] + - [487, 4785.48] - - [6784, 704, 1, 1280] - - [487, 5578.95] + - [491, 5578.95] - - [256, 4288, 1, 256] - - [487, 6781.33] + - [491, 6781.33] - - [3584, 64, 1, 128] - - [426, 1473.9] + - [430, 1473.9] - - [5888, 1024, 1, 3328] - - [485, 8639.39] + - [489, 8639.39] - - [448, 64, 1, 128] - - [417, 259.182] + - [421, 259.182] - - [704, 6784, 1, 1280] - - [493, 9027.15] + - [497, 9027.15] - - [5888, 128, 1, 256] - - [493, 6812.78] + - [497, 6812.78] - - [2368, 448, 1, 3328] - - [493, 7356.53] + - [497, 7356.53] - - [1856, 5056, 1, 3328] - - [492, 8871.46] + - [496, 8871.46] - - [4, 6784, 1, 256] - - [501, 469.379] + - [505, 469.379] - - [1024, 3584, 1, 128] - - [479, 3427.92] + - [483, 3427.92] - - [1024, 1408, 1, 128] - - [482, 2934.95] + - [486, 2934.95] - - [2368, 2944, 1, 128] - - [482, 4887.92] + - [486, 4887.92] - - [5056, 64, 1, 256] - - [451, 3186.06] + - [455, 3186.06] - - [4, 448, 1, 1280] - - [456, 273.067] + - [460, 273.067] - - [5056, 2944, 1, 128] - - [483, 4752.69] + - [487, 4752.69] - - [5888, 5056, 1, 3328] - - [497, 9124.67] + - [501, 9124.67] - - [1024, 704, 1, 128] - - [482, 2302.26] + - [486, 2302.26] - - [1408, 2368, 1, 128] - - [482, 3826.85] + - [486, 3826.85] - - [5888, 2368, 1, 128] - - [479, 4912.67] + - [483, 4912.67] - - [128, 5056, 1, 3328] - - [469, 7583.7] + - [473, 7583.7] - - [3584, 6784, 1, 1280] - - [496, 9313.4] + - [500, 9313.4] - - [3072, 7435, 1, 1024] - - [490, 9321.97] + - [494, 9321.97] - - [1856, 5888, 1, 256] - - [487, 5778.24] + - [491, 5778.24] - - [256, 256, 1, 256] - - [439, 1576.81] + - [443, 1576.81] - - [256, 64, 1, 128] - - [425, 173.605] + - [429, 173.605] - - [4288, 4288, 1, 3328] - - [492, 8416.17] + - [496, 8416.17] - - [4288, 1408, 1, 1280] - - [498, 9301.87] + - [502, 9301.87] - - [3584, 5056, 1, 128] - - [484, 4344.84] + - [488, 4344.84] - - [4, 1024, 1, 3328] - - [453, 615.139] + - [457, 615.139] - - [4288, 2368, 1, 256] - - [487, 9142.57] + - [491, 9142.57] - - [2944, 5056, 1, 1280] - - [487, 9399.59] + - [491, 9399.59] - - [448, 6784, 1, 256] - - [486, 5710.83] + - [490, 5710.83] - - [64, 1024, 1, 3328] - - [469, 4975.0] + - [473, 4975.0] - - [6784, 2368, 1, 3328] - - [496, 9207.53] + - [500, 9207.53] - - [256, 1024, 1, 1280] - - [493, 5983.32] + - [497, 5983.32] - - [704, 4, 1, 128] - - [499, 15.0187] + - [503, 15.0187] - - [256, 4, 1, 256] - - [456, 52.8516] + - [460, 52.8516] - - [4288, 128, 1, 256] - - [487, 5242.88] + - [491, 5242.88] - - [4288, 1856, 1, 3328] - - [498, 9353.96] + - [502, 9353.96] - - [3584, 448, 1, 128] - - [479, 3353.8] + - [483, 3353.8] - - [256, 4, 1, 3328] - - [510, 313.224] + - [514, 313.224] - - [4, 1408, 1, 1280] - - [453, 509.107] + - [457, 509.107] - - [3584, 64, 1, 1280] - - [441, 5198.32] + - [445, 5198.32] - - [1408, 448, 1, 128] - - [479, 2628.27] + - [483, 2628.27] - - [3584, 1024, 1, 1280] - - [493, 8534.91] + - [497, 8534.91] - - [1856, 5056, 1, 256] - - [485, 8184.39] + - [489, 8184.39] - - [4, 3584, 1, 256] - - [503, 395.476] + - [507, 395.476] - - [1024, 4288, 1, 256] - - [488, 5966.42] + - [492, 5966.42] - - [5888, 3584, 1, 3328] - - [491, 9189.33] + - [495, 9189.33] - - [4, 256, 1, 256] - - [507, 41.4785] + - [511, 41.4785] - - [5056, 3584, 1, 3328] - - [492, 9431.82] + - [496, 9431.82] - - [128, 5888, 1, 1280] - - [487, 8192.0] + - [491, 8192.0] - - [704, 448, 1, 128] - - [479, 1510.86] + - [483, 1510.86] - - [2368, 1408, 1, 1280] - - [487, 8415.55] + - [491, 8415.55] - - [5056, 2944, 1, 1280] - - [498, 9294.67] + - [502, 9294.67] - - [4, 4, 1, 128] - - [500, 0.0356549] + - [504, 0.0356549] - - [3584, 256, 1, 256] - - [487, 6749.45] + - [491, 6749.45] - - [128, 1856, 1, 3328] - - [440, 6796.99] + - [444, 6796.99] - - [1024, 6784, 1, 256] - - [493, 8782.99] + - [497, 8782.99] - - [4, 128, 1, 256] - - [453, 27.3067] + - [457, 27.3067] - - [64, 64, 1, 1280] - - [472, 712.348] + - [476, 712.348] - - [6784, 4, 1, 128] - - [500, 121.96] + - [504, 121.96] - - [2944, 1408, 1, 128] - - [482, 4430.36] + - [486, 4430.36] - - [448, 128, 1, 3328] - - [469, 5097.24] + - [473, 5097.24] - - [64, 2944, 1, 3328] - - [469, 6362.1] + - [473, 6362.1] - - [64, 4288, 1, 3328] - - [469, 6564.91] + - [473, 6564.91] - - [5056, 6784, 1, 3328] - - [493, 8121.08] + - [497, 8121.08] - - [128, 2944, 1, 256] - - [487, 4692.07] + - [491, 4692.07] - - [128, 6784, 1, 128] - - [416, 2687.36] + - [420, 2687.36] - - [3584, 4288, 1, 256] - - [493, 9193.89] + - [497, 9193.89] - - [448, 1856, 1, 256] - - [493, 6231.29] + - [497, 6231.29] - - [1856, 6784, 1, 3328] - - [498, 9191.38] + - [502, 9191.38] - - [3584, 128, 1, 3328] - - [487, 7368.37] + - [491, 7368.37] - - [64, 1856, 1, 256] - - [438, 2184.53] + - [442, 2184.53] - - [1024, 448, 1, 1280] - - [493, 6977.22] + - [497, 6977.22] - - [5888, 4288, 1, 256] - - [493, 5780.4] + - [497, 5780.4] - - [4, 448, 1, 128] - - [500, 8.96] + - [504, 8.96] - - [5056, 1408, 1, 256] - - [487, 5601.25] + - [491, 5601.25] - - [64, 256, 1, 1280] - - [453, 1927.53] + - [457, 1927.53] - - [3584, 1024, 1, 256] - - [498, 7542.74] + - [502, 7542.74] - - [256, 704, 1, 256] - - [487, 2957.52] + - [491, 2957.52] - - [5888, 5888, 1, 256] - - [498, 7344.04] + - [502, 7344.04] - - [4288, 1024, 1, 1280] - - [493, 8925.74] + - [497, 8925.74] - - [5888, 128, 1, 3328] - - [487, 8409.97] + - [491, 8409.97] - - [448, 6784, 1, 3328] - - [487, 8862.46] + - [491, 8862.46] - - [2944, 1408, 1, 1280] - - [498, 7478.83] + - [502, 7478.83] - - [1024, 32, 1, 512] - - [442, 1777.25] + - [446, 1777.25] - - [2944, 1856, 1, 3328] - - [487, 9153.33] + - [491, 9153.33] - - [2368, 64, 1, 128] - - [426, 1102.2] + - [430, 1102.2] - - [2944, 2944, 1, 128] - - [478, 4591.85] + - [482, 4591.85] - - [4, 128, 1, 3328] - - [508, 118.99] + - [512, 118.99] - - [3584, 5888, 1, 1280] - - [487, 9222.39] + - [491, 9222.39] - - [64, 4, 1, 128] - - [499, 0.93516] + - [503, 0.93516] - - [6784, 1856, 1, 1280] - - [487, 9135.97] + - [491, 9135.97] - - [2944, 5056, 1, 256] - - [493, 8860.03] + - [497, 8860.03] - - [2944, 5888, 1, 1280] - - [486, 9643.53] + - [490, 9643.53] - - [5888, 256, 1, 3328] - - [493, 8799.43] + - [497, 8799.43] - - [1856, 5888, 1, 3328] - - [493, 9457.43] + - [497, 9457.43] - - [3584, 1408, 1, 256] - - [493, 8672.43] + - [497, 8672.43] - - [704, 3584, 1, 3328] - - [493, 8525.2] + - [497, 8525.2] - - [5056, 448, 1, 1280] - - [493, 8843.67] + - [497, 8843.67] - - [3584, 1856, 1, 3328] - - [485, 8881.43] + - [489, 8881.43] - - [64, 1408, 1, 128] - - [414, 747.042] + - [418, 747.042] - - [1408, 704, 1, 1280] - - [487, 8342.83] + - [491, 8342.83] - - [2944, 1024, 1, 256] - - [498, 8079.48] + - [502, 8079.48] - - [1024, 2368, 1, 128] - - [482, 3347.48] + - [486, 3347.48] - - [2368, 4288, 1, 3328] - - [493, 9467.57] + - [497, 9467.57] - - [4, 1408, 1, 256] - - [505, 257.463] + - [509, 257.463] - - [1024, 1408, 1, 1280] - - [493, 8241.74] + - [497, 8241.74] - - [64, 64, 1, 256] - - [453, 189.959] + - [457, 189.959] - - [704, 256, 1, 3328] - - [487, 4519.18] + - [491, 4519.18] - - [6784, 5056, 1, 256] - - [486, 9133.68] + - [490, 9133.68] - - [4, 4288, 1, 3328] - - [437, 669.975] + - [441, 669.975] - - [448, 6784, 1, 128] - - [479, 4481.82] + - [483, 4481.82] - - [4, 704, 1, 3328] - - [509, 522.971] + - [513, 522.971] - - [448, 2944, 1, 256] - - [487, 7022.49] + - [491, 7022.49] - - [2944, 6784, 1, 256] - - [493, 9199.74] + - [497, 9199.74] - - [2368, 2368, 1, 1280] - - [498, 8646.74] + - [502, 8646.74] - - [4, 4, 1, 1280] - - [456, 3.01176] + - [460, 3.01176] - - [1856, 3584, 1, 1280] - - [485, 8805.35] + - [489, 8805.35] - - [64, 2944, 1, 256] - - [459, 2565.66] + - [463, 2565.66] - - [3584, 1408, 1, 1280] - - [498, 9273.02] + - [502, 9273.02] - - [448, 256, 1, 128] - - [414, 941.03] + - [418, 941.03] - - [4288, 448, 1, 128] - - [480, 3215.1] + - [484, 3215.1] - - [5056, 256, 1, 1280] - - [493, 8790.03] + - [497, 8790.03] - - [1856, 1408, 1, 3328] - - [487, 9310.63] + - [491, 9310.63] - - [128, 128, 1, 128] - - [422, 155.115] + - [426, 155.115] - - [1024, 4288, 1, 3328] - - [490, 8528.02] + - [494, 8528.02] - - [448, 2368, 1, 256] - - [494, 5097.24] + - [498, 5097.24] - - [1024, 4, 1, 128] - - [500, 10.2721] + - [504, 10.2721] - - [5056, 448, 1, 256] - - [493, 8236.68] + - [497, 8236.68] - - [2944, 2368, 1, 3328] - - [486, 9331.06] + - [490, 9331.06] - - [704, 128, 1, 3328] - - [461, 5969.2] + - [465, 5969.2] - - [64, 64, 1, 3328] - - [477, 1494.68] + - [481, 1494.68] - - [1024, 1856, 1, 1280] - - [492, 6356.33] + - [496, 6356.33] - - [6784, 1856, 1, 256] - - [493, 9068.53] + - [497, 9068.53] - - [128, 2368, 1, 3328] - - [469, 6714.12] + - [473, 6714.12] - - [1024, 5888, 1, 256] - - [493, 5501.5] + - [497, 5501.5] - - [5056, 128, 1, 1280] - - [449, 6455.54] + - [453, 6455.54] - - [5056, 64, 1, 3328] - - [454, 6703.71] + - [458, 6703.71] - - [128, 704, 1, 128] - - [415, 696.518] + - [419, 696.518] - - [1408, 2368, 1, 256] - - [487, 8667.15] + - [491, 8667.15] - - [1408, 1408, 1, 256] - - [498, 7615.71] + - [502, 7615.71] - - [4, 64, 1, 128] - - [500, 0.98463] + - [504, 0.98463] - - [64, 128, 1, 1280] - - [472, 1379.71] + - [476, 1379.71] - - [2368, 2368, 1, 128] - - [482, 4582.16] + - [486, 4582.16] - - [64, 5888, 1, 128] - - [415, 2086.27] + - [419, 2086.27] - - [5888, 4, 1, 3328] - - [436, 667.414] + - [440, 667.414] - - [6784, 1408, 1, 128] - - [483, 4516.24] + - [487, 4516.24] - - [4288, 5888, 1, 256] - - [498, 8497.33] + - [502, 8497.33] - - [1408, 5056, 1, 256] - - [487, 8867.36] + - [491, 8867.36] - - [5056, 128, 1, 3328] - - [469, 7678.88] + - [473, 7678.88] - - [128, 128, 1, 1280] - - [457, 2016.49] + - [461, 2016.49] - - [448, 704, 1, 256] - - [488, 3030.79] + - [492, 3030.79] - - [4288, 3584, 1, 128] - - [479, 5246.23] + - [483, 5246.23] - - [2944, 128, 1, 3328] - - [454, 6795.06] + - [458, 6795.06] - - [128, 5056, 1, 1280] - - [440, 6192.99] + - [444, 6192.99] - - [3584, 5056, 1, 1280] - - [492, 9499.07] + - [496, 9499.07] - - [256, 448, 1, 1280] - - [448, 4267.46] + - [452, 4267.46] - - [704, 704, 1, 128] - - [482, 2259.22] + - [486, 2259.22] - - [5056, 4, 1, 128] - - [500, 12.4313] + - [504, 12.4313] - - [704, 256, 1, 1280] - - [487, 4355.87] + - [491, 4355.87] - - [64, 2368, 1, 3328] - - [461, 6310.87] + - [465, 6310.87] - - [1856, 1024, 1, 128] - - [478, 4065.33] + - [482, 4065.33] - - [1856, 64, 1, 128] - - [417, 936.229] + - [421, 936.229] - - [64, 6784, 1, 1280] - - [440, 5731.7] + - [444, 5731.7] - - [704, 4288, 1, 256] - - [493, 5218.8] + - [497, 5218.8] - - [5888, 2368, 1, 1280] - - [487, 9378.8] + - [491, 9378.8] - - [128, 256, 1, 256] - - [457, 1219.27] + - [461, 1219.27] - - [256, 64, 1, 1280] - - [459, 1820.44] + - [463, 1820.44] - - [2368, 5888, 1, 1280] - - [498, 9143.54] + - [502, 9143.54] - - [5888, 256, 1, 1280] - - [487, 8678.37] + - [491, 8678.37] - - [4, 5888, 1, 1280] - - [434, 668.142] + - [438, 668.142] - - [704, 128, 1, 128] - - [422, 649.456] + - [426, 649.456] - - [1024, 4, 1, 1280] - - [453, 478.365] + - [457, 478.365] - - [2368, 1856, 1, 3328] - - [485, 8153.77] + - [489, 8153.77] - - [2368, 128, 1, 128] - - [420, 1858.11] + - [424, 1858.11] - - [2944, 704, 1, 256] - - [487, 8437.97] + - [491, 8437.97] - - [5056, 128, 1, 128] - - [416, 2689.53] + - [420, 2689.53] - - [256, 704, 1, 3328] - - [487, 4541.08] + - [491, 4541.08] - - [704, 3584, 1, 256] - - [488, 7770.97] + - [492, 7770.97] - - [1024, 1024, 1, 1024] - - [493, 8305.52] + - [497, 8305.52] - - [704, 2944, 1, 3328] - - [493, 9166.38] + - [497, 9166.38] - - [6784, 1024, 1, 128] - - [478, 4362.21] + - [482, 4362.21] - - [256, 448, 1, 128] - - [425, 899.514] + - [429, 899.514] - - [448, 1024, 1, 3328] - - [487, 7385.46] + - [491, 7385.46] - - [2944, 1024, 1, 3328] - - [490, 8779.71] + - [494, 8779.71] - - [2944, 5056, 1, 128] - - [482, 5103.01] + - [486, 5103.01] - - [1408, 6784, 1, 256] - - [493, 8346.79] + - [497, 8346.79] - - [6784, 1408, 1, 3328] - - [489, 8878.3] + - [493, 8878.3] - - [4288, 6784, 1, 128] - - [478, 5432.89] + - [482, 5432.89] - - [704, 64, 1, 256] - - [467, 1441.79] + - [471, 1441.79] - - [5888, 4, 1, 1280] - - [504, 636.541] + - [508, 636.541] - - [256, 2368, 1, 3328] - - [487, 6804.7] + - [491, 6804.7] - - [6784, 2944, 1, 1280] - - [486, 9472.16] + - [490, 9472.16] - - [4288, 1856, 1, 128] - - [482, 4886.28] + - [486, 4886.28] - - [1856, 2944, 1, 128] - - [479, 4642.86] + - [483, 4642.86] - - [6784, 448, 1, 128] - - [479, 4369.07] + - [483, 4369.07] - - [64, 3584, 1, 128] - - [426, 1645.75] + - [430, 1645.75] - - [448, 5056, 1, 1280] - - [487, 8553.54] + - [491, 8553.54] - - [2368, 1856, 1, 128] - - [479, 4741.75] + - [483, 4741.75] - - [128, 448, 1, 1280] - - [469, 3744.91] + - [473, 3744.91] - - [4288, 704, 1, 256] - - [487, 8444.06] + - [491, 8444.06] - - [256, 3584, 1, 128] - - [479, 2454.86] + - [483, 2454.86] - - [5888, 704, 1, 256] - - [487, 8819.47] + - [491, 8819.47] - - [3584, 1024, 1, 128] - - [482, 4094.86] + - [486, 4094.86] - - [256, 5888, 1, 3328] - - [496, 8538.23] + - [500, 8538.23] - - [1408, 4288, 1, 3328] - - [498, 9212.47] + - [502, 9212.47] - - [6784, 4288, 1, 256] - - [486, 9163.02] + - [490, 9163.02] - - [4288, 256, 1, 128] - - [479, 3081.34] + - [483, 3081.34] - - [5888, 256, 1, 256] - - [487, 7680.65] + - [491, 7680.65] - - [6784, 1024, 1, 1280] - - [498, 9248.53] + - [502, 9248.53] - - [5888, 1024, 1, 128] - - [482, 4061.84] + - [486, 4061.84] - - [1024, 128, 1, 256] - - [493, 2317.29] + - [497, 2317.29] - - [128, 64, 1, 3328] - - [476, 2116.69] + - [480, 2116.69] - - [448, 64, 1, 256] - - [459, 1079.42] + - [463, 1079.42] - - [2368, 256, 1, 128] - - [480, 2229.73] + - [484, 2229.73] - - [6784, 3584, 1, 1280] - - [493, 9096.5] + - [497, 9096.5] - - [1024, 6784, 1, 1280] - - [491, 9112.8] + - [495, 9112.8] - - [2944, 64, 1, 1280] - - [449, 4982.9] + - [453, 4982.9] - - [1408, 2944, 1, 1280] - - [488, 9131.53] + - [492, 9131.53] - - [256, 1856, 1, 256] - - [496, 4432.76] + - [500, 4432.76] - - [1408, 2368, 1, 3328] - - [496, 8449.08] + - [500, 8449.08] - - [2944, 4, 1, 3328] - - [442, 673.84] + - [446, 673.84] - - [128, 1408, 1, 3328] - - [461, 6582.37] + - [465, 6582.37] - - [2944, 1856, 1, 128] - - [479, 4827.44] + - [483, 4827.44] - - [256, 2944, 1, 128] - - [482, 2416.56] + - [486, 2416.56] - - [256, 6784, 1, 128] - - [482, 3118.66] + - [486, 3118.66] - - [2368, 4, 1, 128] - - [500, 22.6197] + - [504, 22.6197] - - [1408, 256, 1, 3328] - - [487, 3733.72] + - [491, 3733.72] - - [1856, 4, 1, 128] - - [499, 7.10009] + - [503, 7.10009] - - [1024, 16, 1, 512] - - [455, 1165.08] + - [459, 1165.08] - - [5056, 6784, 1, 128] - - [483, 4949.03] + - [487, 4949.03] - - [4288, 5056, 1, 128] - - [482, 4966.8] + - [486, 4966.8] - - [1856, 5888, 1, 128] - - [478, 4351.66] + - [482, 4351.66] - - [2944, 5888, 1, 256] - - [498, 8460.89] + - [502, 8460.89] - - [3584, 1856, 1, 256] - - [493, 8876.6] + - [497, 8876.6] - - [4288, 3584, 1, 1280] - - [486, 9603.6] + - [490, 9603.6] - - [2368, 448, 1, 256] - - [487, 6604.6] + - [491, 6604.6] - - [4288, 256, 1, 3328] - - [487, 7619.79] + - [491, 7619.79] - - [1856, 704, 1, 128] - - [479, 3629.51] + - [483, 3629.51] - - [1408, 64, 1, 256] - - [443, 2168.11] + - [447, 2168.11] - - [64, 1856, 1, 128] - - [419, 979.662] + - [423, 979.662] - - [4, 256, 1, 128] - - [500, 5.13595] + - [504, 5.13595] - - [704, 4288, 1, 3328] - - [493, 9014.42] + - [497, 9014.42] - - [704, 5888, 1, 128] - - [480, 4221.67] + - [484, 4221.67] - - [6784, 3584, 1, 128] - - [478, 5360.63] + - [482, 5360.63] - - [1024, 64, 1, 256] - - [438, 1588.75] + - [442, 1588.75] - - [64, 2368, 1, 256] - - [493, 2552.45] + - [497, 2552.45] - - [4288, 5056, 1, 3328] - - [492, 8193.28] + - [496, 8193.28] - - [4, 1856, 1, 1280] - - [442, 499.092] + - [446, 499.092] - - [4288, 128, 1, 128] - - [479, 2373.47] + - [483, 2373.47] - - [1408, 1408, 1, 128] - - [482, 3753.78] + - [486, 3753.78] - - [1024, 128, 1, 3328] - - [464, 5656.22] + - [468, 5656.22] - - [1856, 128, 1, 128] - - [415, 1617.48] + - [419, 1617.48] - - [5056, 2368, 1, 256] - - [498, 5553.31] + - [502, 5553.31] - - [4288, 704, 1, 3328] - - [486, 6961.96] + - [490, 6961.96] - - [448, 3584, 1, 256] - - [496, 5981.4] + - [500, 5981.4] - - [64, 128, 1, 128] - - [433, 74.8983] + - [437, 74.8983] - - [2368, 64, 1, 1280] - - [469, 5041.23] + - [473, 5041.23] - - [2368, 1024, 1, 1280] - - [494, 7740.87] + - [498, 7740.87] - - [2944, 1408, 1, 3328] - - [496, 9204.55] + - [500, 9204.55] - - [1408, 448, 1, 256] - - [493, 5954.3] + - [497, 5954.3] - - [1024, 1408, 1, 3328] - - [490, 8161.44] + - [494, 8161.44] - - [2560, 7133, 1, 2560] - - [485, 9636.59] + - [489, 9636.59] - - [1408, 4, 1, 1280] - - [437, 520.879] + - [441, 520.879] - - [5888, 3584, 1, 256] - - [498, 9225.16] + - [502, 9225.16] - - [128, 1024, 1, 1280] - - [440, 4755.45] + - [444, 4755.45] - - [1408, 1856, 1, 3328] - - [490, 9130.77] + - [494, 9130.77] - - [4, 4, 1, 3328] - - [510, 6.93333] + - [514, 6.93333] - - [6784, 1408, 1, 1280] - - [487, 9346.81] + - [491, 9346.81] - - [4, 1024, 1, 1280] - - [437, 422.813] + - [441, 422.813] - - [704, 2944, 1, 256] - - [493, 8331.96] + - [497, 8331.96] - - [704, 4288, 1, 128] - - [479, 4371.04] + - [483, 4371.04] - - [2368, 4288, 1, 128] - - [479, 3988.79] + - [483, 3988.79] - - [64, 4288, 1, 1280] - - [469, 5407.53] + - [473, 5407.53] - - [6784, 64, 1, 1280] - - [449, 5708.15] + - [453, 5708.15] - - [3584, 128, 1, 128] - - [415, 2463.1] + - [419, 2463.1] - - [1024, 6784, 1, 128] - - [480, 3862.02] + - [484, 3862.02] - - [4, 1856, 1, 128] - - [500, 30.5362] + - [504, 30.5362] - - [1408, 64, 1, 3328] - - [469, 6095.38] + - [473, 6095.38] - - [6784, 4, 1, 256] - - [502, 487.838] + - [506, 487.838] - - [1408, 1408, 1, 1280] - - [498, 8640.53] + - [502, 8640.53] - - [256, 2368, 1, 256] - - [490, 4282.26] + - [494, 4282.26] - - [448, 4288, 1, 3328] - - [487, 8516.03] + - [491, 8516.03] - - [2368, 1408, 1, 256] - - [493, 8632.09] + - [497, 8632.09] - - [5888, 5056, 1, 128] - - [479, 5091.01] + - [483, 5091.01] - - [704, 2368, 1, 256] - - [493, 7664.7] + - [497, 7664.7] - - [2944, 448, 1, 1280] - - [493, 7618.25] + - [497, 7618.25] - - [5888, 2368, 1, 3328] - - [496, 9343.38] + - [500, 9343.38] - - [64, 2944, 1, 1280] - - [461, 5162.08] + - [465, 5162.08] - - [448, 1856, 1, 1280] - - [487, 7027.9] + - [491, 7027.9] - - [4288, 448, 1, 1280] - - [487, 5855.66] + - [491, 5855.66] - - [5888, 704, 1, 3328] - - [496, 9190.81] + - [500, 9190.81] - - [5056, 256, 1, 128] - - [482, 3235.84] + - [486, 3235.84] - - [1856, 256, 1, 128] - - [480, 1849.68] + - [484, 1849.68] - - [5056, 128, 1, 256] - - [493, 6108.96] + - [497, 6108.96] - - [704, 4, 1, 256] - - [453, 125.156] + - [457, 125.156] - - [1408, 5888, 1, 128] - - [479, 5055.06] + - [483, 5055.06] - - [4288, 4, 1, 128] - - [499, 95.6209] + - [503, 95.6209] - - [1408, 1024, 1, 256] - - [487, 7370.18] + - [491, 7370.18] - - [1024, 1856, 1, 128] - - [479, 2966.7] + - [483, 2966.7] - - [256, 704, 1, 128] - - [481, 528.129] + - [485, 528.129] - - [256, 1024, 1, 128] - - [479, 1171.59] + - [483, 1171.59] - - [448, 1024, 1, 256] - - [493, 5624.55] + - [497, 5624.55] - - [128, 4, 1, 3328] - - [510, 191.885] + - [514, 191.885] - - [5056, 6784, 1, 1280] - - [487, 9543.97] + - [491, 9543.97] - - [704, 5056, 1, 3328] - - [494, 8790.25] + - [498, 8790.25] - - [64, 1408, 1, 1280] - - [461, 4505.6] + - [465, 4505.6] - - [3584, 5056, 1, 3328] - - [492, 9073.42] + - [496, 9073.42] - - [1856, 4, 1, 3328] - - [510, 612.775] + - [514, 612.775] - - [4, 2944, 1, 128] - - [499, 71.9145] + - [503, 71.9145] - - [2368, 2944, 1, 3328] - - [485, 9314.58] + - [489, 9314.58] - - [448, 448, 1, 1280] - - [469, 5129.81] + - [473, 5129.81] - - [2368, 3584, 1, 256] - - [487, 8998.7] + - [491, 8998.7] - - [5056, 3584, 1, 1280] - - [488, 9345.07] + - [492, 9345.07] - - [448, 4, 1, 3328] - - [510, 487.237] + - [514, 487.237] - - [1856, 2944, 1, 1280] - - [498, 8438.69] + - [502, 8438.69] - - [3584, 2368, 1, 1280] - - [493, 9298.8] + - [497, 9298.8] - - [128, 1024, 1, 256] - - [445, 2356.35] + - [449, 2356.35] - - [2944, 1408, 1, 256] - - [485, 5440.72] + - [489, 5440.72] - - [4288, 1408, 1, 3328] - - [485, 9385.99] + - [489, 9385.99] - - [3584, 64, 1, 3328] - - [441, 6310.87] + - [445, 6310.87] - - [1408, 128, 1, 256] - - [487, 2942.43] + - [491, 2942.43] - - [2944, 1024, 1, 128] - - [482, 3927.89] + - [486, 3927.89] - - [4288, 5056, 1, 1280] - - [489, 8328.48] + - [493, 8328.48] - - [5888, 6784, 1, 1280] - - [498, 9757.34] + - [502, 9757.34] - - [6784, 5056, 1, 128] - - [478, 5101.3] + - [482, 5101.3] - - [256, 1024, 1, 3328] - - [487, 6475.77] + - [491, 6475.77] - - [3584, 4, 1, 256] - - [503, 420.873] + - [507, 420.873] - - [1856, 64, 1, 3328] - - [469, 6409.1] + - [473, 6409.1] - - [64, 6784, 1, 128] - - [417, 2387.22] + - [421, 2387.22] - - [5888, 1408, 1, 3328] - - [492, 9655.79] + - [496, 9655.79] - - [5888, 64, 1, 1280] - - [487, 5870.76] + - [491, 5870.76] - - [256, 5056, 1, 256] - - [490, 6108.96] + - [494, 6108.96] - - [128, 3584, 1, 128] - - [420, 2383.13] + - [424, 2383.13] - - [448, 3584, 1, 3328] - - [485, 7092.18] + - [489, 7092.18] - - [704, 2368, 1, 128] - - [479, 3740.98] + - [483, 3740.98] - - [5888, 256, 1, 128] - - [480, 2977.44] + - [484, 2977.44] - - [4, 5056, 1, 128] - - [499, 132.62] + - [503, 132.62] - - [448, 256, 1, 256] - - [451, 2308.19] + - [455, 2308.19] - - [704, 4, 1, 3328] - - [456, 552.574] + - [460, 552.574] - - [1408, 256, 1, 256] - - [487, 4577.12] + - [491, 4577.12] - - [3584, 1856, 1, 128] - - [479, 4571.76] + - [483, 4571.76] - - [4288, 4288, 1, 128] - - [482, 5284.55] + - [486, 5284.55] - - [1856, 1024, 1, 3328] - - [493, 6362.15] + - [497, 6362.15] - - [128, 5888, 1, 3328] - - [463, 7040.73] + - [467, 7040.73] - - [1024, 5056, 1, 256] - - [498, 7855.6] + - [502, 7855.6] - - [2368, 1408, 1, 3328] - - [493, 9205.56] + - [497, 9205.56] - - [5888, 448, 1, 256] - - [490, 5538.74] + - [494, 5538.74] - - [5888, 6784, 1, 128] - - [478, 4500.75] + - [482, 4500.75] - - [2368, 4, 1, 3328] - - [456, 642.798] + - [460, 642.798] - - [6784, 5056, 1, 1280] - - [494, 9249.13] + - [498, 9249.13] - - [5056, 704, 1, 1280] - - [493, 8883.27] + - [497, 8883.27] - - [1408, 256, 1, 1280] - - [487, 5632.0] + - [491, 5632.0] - - [4288, 6784, 1, 1280] - - [493, 8843.21] + - [497, 8843.21] - - [128, 704, 1, 256] - - [451, 2045.09] + - [455, 2045.09] - - [448, 128, 1, 1280] - - [461, 3807.07] + - [465, 3807.07] - - [6784, 4, 1, 3328] - - [504, 684.571] + - [508, 684.571] - - [4288, 4, 1, 1280] - - [453, 601.825] + - [457, 601.825] - - [1024, 64, 1, 3328] - - [465, 3928.38] + - [469, 3928.38] - - [1856, 4, 1, 256] - - [503, 293.294] + - [507, 293.294] - - [64, 3584, 1, 1280] - - [487, 5265.45] + - [491, 5265.45] - - [6784, 1408, 1, 256] - - [487, 9059.26] + - [491, 9059.26] - - [3584, 5888, 1, 128] - - [479, 5084.19] + - [483, 5084.19] - - [5056, 5888, 1, 256] - - [498, 8589.99] + - [502, 8589.99] - - [2368, 1024, 1, 256] - - [490, 4493.03] + - [494, 4493.03] - - [2944, 1856, 1, 256] - - [496, 5202.31] + - [500, 5202.31] - - [1856, 6784, 1, 1280] - - [494, 9071.38] + - [498, 9071.38] - - [64, 5056, 1, 128] - - [417, 2038.32] + - [421, 2038.32] - - [5888, 64, 1, 128] - - [416, 2016.49] + - [420, 2016.49] - - [448, 704, 1, 128] - - [480, 1173.55] + - [484, 1173.55] - - [4, 1024, 1, 128] - - [499, 8.79685] + - [503, 8.79685] - - [4288, 3584, 1, 256] - - [493, 9080.16] + - [497, 9080.16] - - [1408, 704, 1, 128] - - [479, 3165.61] + - [483, 3165.61] - - [64, 256, 1, 3328] - - [473, 3126.49] + - [477, 3126.49] - - [5056, 1856, 1, 1280] - - [490, 8857.45] + - [494, 8857.45] - - [1408, 1024, 1, 3328] - - [496, 8177.02] + - [500, 8177.02] - - [2368, 256, 1, 3328] - - [487, 6810.21] + - [491, 6810.21] - - [5888, 3584, 1, 1280] - - [485, 9535.45] + - [489, 9535.45] - - [1856, 3584, 1, 3328] - - [487, 9281.81] + - [491, 9281.81] - - [5888, 128, 1, 1280] - - [493, 8136.72] + - [497, 8136.72] - - [1024, 2944, 1, 256] - - [485, 7247.86] + - [489, 7247.86] - - [448, 6784, 1, 1280] - - [493, 7013.94] + - [497, 7013.94] - - [256, 3584, 1, 1280] - - [487, 7738.54] + - [491, 7738.54] - - [448, 128, 1, 128] - - [417, 495.948] + - [421, 495.948] - - [704, 5056, 1, 256] - - [493, 8609.34] + - [497, 8609.34] - - [3584, 1024, 1, 3328] - - [486, 7765.63] + - [490, 7765.63] - - [2944, 1856, 1, 1280] - - [498, 7775.93] + - [502, 7775.93] - - [128, 256, 1, 128] - - [430, 296.208] + - [434, 296.208] - - [5056, 256, 1, 256] - - [487, 7829.63] + - [491, 7829.63] - - [2368, 3584, 1, 3328] - - [486, 8895.98] + - [490, 8895.98] - - [2944, 704, 1, 1280] - - [496, 6855.73] + - [500, 6855.73] - - [128, 4, 1, 256] - - [505, 24.8242] + - [509, 24.8242] - - [2944, 3584, 1, 1280] - - [498, 9049.12] + - [502, 9049.12] - - [1856, 5888, 1, 1280] - - [493, 9431.96] + - [497, 9431.96] - - [256, 256, 1, 1280] - - [458, 3942.02] + - [462, 3942.02] - - [5056, 448, 1, 3328] - - [498, 4587.73] + - [502, 4587.73] - - [4288, 1408, 1, 256] - - [498, 5408.73] + - [502, 5408.73] - - [3584, 64, 1, 256] - - [467, 2496.61] + - [471, 2496.61] - - [64, 1856, 1, 3328] - - [440, 5896.68] + - [444, 5896.68] - - [256, 1408, 1, 128] - - [479, 1643.07] + - [483, 1643.07] - - [5888, 1408, 1, 128] - - [478, 4436.27] + - [482, 4436.27] - - [4288, 2368, 1, 1280] - - [487, 9432.94] + - [491, 9432.94] - - [4, 4288, 1, 256] - - [502, 442.632] + - [506, 442.632] - - [256, 4288, 1, 128] - - [479, 2814.69] + - [483, 2814.69] - - [256, 128, 1, 3328] - - [468, 3951.16] + - [472, 3951.16] - - [6784, 2368, 1, 256] - - [487, 9169.89] + - [491, 9169.89] - - [5888, 128, 1, 128] - - [416, 3156.71] + - [420, 3156.71] - - [4288, 1856, 1, 256] - - [493, 5658.13] + - [497, 5658.13] - - [1856, 256, 1, 3328] - - [487, 7646.27] + - [491, 7646.27] - - [1856, 2944, 1, 256] - - [494, 6444.88] + - [498, 6444.88] - - [5056, 1024, 1, 128] - - [478, 4607.2] + - [482, 4607.2] - - [64, 5888, 1, 1280] - - [493, 5842.36] + - [497, 5842.36] - - [1760, 7133, 1, 1760] - - [486, 9097.74] + - [490, 9097.74] - - [6784, 256, 1, 128] - - [479, 3685.31] + - [483, 3685.31] - - [5888, 704, 1, 128] - - [478, 3656.13] + - [482, 3656.13] - - [6784, 64, 1, 128] - - [429, 2191.42] + - [433, 2191.42] - - [1024, 4288, 1, 1280] - - [493, 9199.22] + - [497, 9199.22] - - [2368, 5056, 1, 3328] - - [489, 9072.78] + - [493, 9072.78] - - [448, 4, 1, 128] - - [500, 5.32937] + - [504, 5.32937] - - [4, 256, 1, 3328] - - [510, 310.937] + - [514, 310.937] - - [4288, 1024, 1, 3328] - - [491, 8660.23] + - [495, 8660.23] - - [1024, 5056, 1, 3328] - - [487, 8886.66] + - [491, 8886.66] - - [1024, 1856, 1, 3328] - - [492, 8426.14] + - [496, 8426.14] - - [704, 704, 1, 1280] - - [487, 7661.7] + - [491, 7661.7] - - [128, 2368, 1, 1280] - - [461, 5746.05] + - [465, 5746.05] - - [1408, 128, 1, 3328] - - [469, 6530.77] + - [473, 6530.77] - - [3584, 256, 1, 1280] - - [493, 7633.94] + - [497, 7633.94] - - [4, 128, 1, 128] - - [500, 1.97874] + - [504, 1.97874] - - [704, 6784, 1, 128] - - [482, 4589.49] + - [486, 4589.49] - - [3584, 128, 1, 1280] - - [487, 7078.14] + - [491, 7078.14] - - [4, 256, 1, 1280] - - [456, 178.087] + - [460, 178.087] - - [128, 704, 1, 3328] - - [461, 5959.71] + - [465, 5959.71] - - [4288, 6784, 1, 256] - - [487, 9326.44] + - [491, 9326.44] - - [3584, 2944, 1, 3328] - - [489, 9114.06] + - [493, 9114.06] - - [128, 1856, 1, 256] - - [493, 3672.55] + - [497, 3672.55] - - [64, 4288, 1, 256] - - [487, 3457.41] + - [491, 3457.41] - - [4, 3584, 1, 3328] - - [436, 694.27] + - [440, 694.27] - - [64, 4, 1, 3328] - - [456, 71.4738] + - [460, 71.4738] - - [4, 64, 1, 3328] - - [456, 91.8069] + - [460, 91.8069] - - [5888, 2944, 1, 256] - - [486, 7241.45] + - [490, 7241.45] - - [2368, 6784, 1, 128] - - [482, 5229.53] + - [486, 5229.53] - - [448, 4288, 1, 1280] - - [487, 8416.3] + - [491, 8416.3] - - [448, 1856, 1, 3328] - - [487, 7161.46] + - [491, 7161.46] - - [4, 1024, 1, 256] - - [453, 187.246] + - [457, 187.246] - - [5056, 4288, 1, 256] - - [498, 8947.16] + - [502, 8947.16] - - [1024, 448, 1, 256] - - [493, 5318.86] + - [497, 5318.86] - - [1024, 3584, 1, 256] - - [488, 6151.94] + - [492, 6151.94] - - [2944, 128, 1, 1280] - - [469, 6053.53] + - [473, 6053.53] - - [1856, 5056, 1, 128] - - [479, 5091.32] + - [483, 5091.32] - - [64, 256, 1, 256] - - [442, 771.012] + - [446, 771.012] - - [1408, 4, 1, 128] - - [499, 40.7758] + - [503, 40.7758] - - [128, 2368, 1, 128] - - [427, 1520.27] + - [431, 1520.27] - - [256, 704, 1, 1280] - - [487, 4329.71] + - [491, 4329.71] - - [64, 2368, 1, 128] - - [418, 1212.42] + - [422, 1212.42] - - [6784, 6784, 1, 3328] - - [498, 8310.57] + - [502, 8310.57] - - [448, 5888, 1, 1280] - - [493, 8502.23] + - [497, 8502.23] - - [5056, 448, 1, 128] - - [479, 4160.9] + - [483, 4160.9] - - [3584, 2944, 1, 128] - - [479, 4363.41] + - [483, 4363.41] - - [6784, 256, 1, 1280] - - [493, 8629.57] + - [497, 8629.57] - - [256, 2944, 1, 1280] - - [493, 7277.38] + - [497, 7277.38] - - [64, 4288, 1, 128] - - [418, 1821.96] + - [422, 1821.96] - - [2368, 5888, 1, 3328] - - [487, 9017.42] + - [491, 9017.42] - - [4, 64, 1, 256] - - [453, 16.0627] + - [457, 16.0627] - - [704, 1024, 1, 3328] - - [493, 8059.45] + - [497, 8059.45] - - [2368, 1856, 1, 1280] - - [493, 8813.14] + - [497, 8813.14] - - [128, 448, 1, 128] - - [414, 588.144] + - [418, 588.144] - - [128, 6784, 1, 256] - - [493, 6538.18] + - [497, 6538.18] - - [3584, 4288, 1, 128] - - [479, 5025.36] + - [483, 5025.36] - - [64, 448, 1, 128] - - [431, 231.693] + - [435, 231.693] - - [5888, 4288, 1, 3328] - - [487, 9515.78] + - [491, 9515.78] - - [2368, 704, 1, 256] - - [493, 7642.74] + - [497, 7642.74] - - [256, 1856, 1, 3328] - - [493, 6547.07] + - [497, 6547.07] - - [1856, 128, 1, 256] - - [487, 3782.18] + - [491, 3782.18] - - [6784, 128, 1, 128] - - [421, 2835.44] + - [425, 2835.44] - - [3584, 1408, 1, 128] - - [478, 3049.11] + - [482, 3049.11] - - [1856, 5056, 1, 1280] - - [494, 8863.2] + - [498, 8863.2] - - [2944, 1024, 1, 1280] - - [498, 8873.15] + - [502, 8873.15] - - [5056, 4, 1, 256] - - [434, 494.021] + - [438, 494.021] - - [3584, 5888, 1, 3328] - - [486, 9585.15] + - [490, 9585.15] - - [2368, 4288, 1, 256] - - [498, 6418.95] + - [502, 6418.95] - - [1024, 2368, 1, 3328] - - [493, 8645.26] + - [497, 8645.26] - - [64, 704, 1, 3328] - - [475, 4399.83] + - [479, 4399.83] - - [704, 1408, 1, 256] - - [487, 7428.44] + - [491, 7428.44] - - [6784, 1856, 1, 3328] - - [498, 9163.56] + - [502, 9163.56] - - [1024, 2944, 1, 128] - - [482, 3551.88] + - [486, 3551.88] - - [1024, 3584, 1, 1280] - - [496, 9112.37] + - [500, 9112.37] - - [4288, 5888, 1, 3328] - - [486, 8523.95] + - [490, 8523.95] - - [4288, 4, 1, 3328] - - [453, 619.916] + - [457, 619.916] - - [256, 1408, 1, 256] - - [487, 4505.6] + - [491, 4505.6] - - [448, 2944, 1, 1280] - - [487, 7612.77] + - [491, 7612.77] - - [4, 5888, 1, 128] - - [499, 174.464] + - [503, 174.464] - - [1024, 2944, 1, 3328] - - [492, 9136.64] + - [496, 9136.64] - - [3584, 6784, 1, 256] - - [492, 7253.79] + - [496, 7253.79] - - [256, 6784, 1, 1280] - - [487, 8637.62] + - [491, 8637.62] - - [1856, 3584, 1, 256] - - [493, 8199.57] + - [497, 8199.57] - - [128, 448, 1, 3328] - - [474, 4799.82] + - [478, 4799.82] - - [6784, 1856, 1, 128] - - [479, 5185.52] + - [483, 5185.52] - - [4, 448, 1, 256] - - [453, 86.8848] + - [457, 86.8848] - - [2944, 704, 1, 128] - - [482, 3798.54] + - [486, 3798.54] - - [256, 5888, 1, 1280] - - [487, 8678.37] + - [491, 8678.37] - - [4, 128, 1, 1280] - - [456, 102.4] + - [460, 102.4] - - [4288, 6784, 1, 3328] - - [492, 8209.3] + - [496, 8209.3] - - [6784, 128, 1, 1280] - - [469, 6562.89] + - [473, 6562.89] - - [64, 1408, 1, 256] - - [459, 2059.7] + - [463, 2059.7] - - [7680, 5481, 1, 2560] - - [498, 9426.69] + - [502, 9426.69] - - [2368, 1408, 1, 128] - - [479, 4532.4] + - [483, 4532.4] - - [1856, 448, 1, 256] - - [487, 6275.38] + - [491, 6275.38] - - [1408, 1024, 1, 128] - - [479, 3604.48] + - [483, 3604.48] - - [128, 64, 1, 128] - - [414, 87.3813] + - [418, 87.3813] - - [6784, 3584, 1, 3328] - - [494, 8991.82] + - [498, 8991.82] - - [2944, 64, 1, 3328] - - [463, 6043.26] + - [467, 6043.26] - - [64, 64, 1, 128] - - [419, 36.209] + - [423, 36.209] - - [2368, 5056, 1, 1280] - - [493, 9438.38] + - [497, 9438.38] - - [64, 4, 1, 1280] - - [456, 40.1569] + - [460, 40.1569] - - [1408, 2368, 1, 1280] - - [489, 7738.06] + - [493, 7738.06] - - [128, 1408, 1, 1280] - - [461, 4937.64] + - [465, 4937.64] - - [256, 64, 1, 3328] - - [471, 2683.36] + - [475, 2683.36] - - [2944, 4288, 1, 128] - - [479, 5173.71] + - [483, 5173.71] - - [2944, 2944, 1, 256] - - [487, 8943.82] + - [491, 8943.82] - - [2944, 4, 1, 1280] - - [436, 617.757] + - [440, 617.757] - - [5888, 4, 1, 256] - - [502, 483.118] + - [506, 483.118] - - [6784, 256, 1, 256] - - [493, 7916.6] + - [497, 7916.6] - - [256, 5056, 1, 3328] - - [487, 8953.15] + - [491, 8953.15] - - [128, 4288, 1, 1280] - - [440, 6014.95] + - [444, 6014.95] - - [5056, 1856, 1, 128] - - [481, 4221.05] + - [485, 4221.05] - - [5888, 1408, 1, 256] - - [492, 9144.75] + - [496, 9144.75] - - [128, 128, 1, 256] - - [442, 759.838] + - [446, 759.838] - - [5056, 4, 1, 3328] - - [502, 642.718] + - [506, 642.718] - - [4288, 3584, 1, 3328] - - [488, 9299.95] + - [492, 9299.95] - - [448, 704, 1, 3328] - - [494, 4480.98] + - [498, 4480.98] - - [448, 448, 1, 128] - - [418, 1360.71] + - [422, 1360.71] - - [1024, 2368, 1, 1280] - - [487, 8570.19] + - [491, 8570.19] - - [1856, 704, 1, 3328] - - [487, 8448.16] + - [491, 8448.16] - - [4, 2368, 1, 128] - - [499, 64.4902] + - [503, 64.4902] - - [5888, 6784, 1, 3328] - - [494, 9447.02] + - [498, 9447.02] - - [704, 4288, 1, 1280] - - [496, 7476.77] + - [500, 7476.77] - - [704, 256, 1, 256] - - [487, 2957.52] + - [491, 2957.52] - - [6784, 448, 1, 3328] - - [490, 8886.12] + - [494, 8886.12] - - [4288, 1024, 1, 128] - - [478, 3864.39] + - [482, 3864.39] - - [49, 512, 128, 2048] - - [521, 7112.68] + - [525, 7112.68] - - [196, 256, 256, 1024] - - [515, 8302.6] + - [519, 8302.6] - - [784, 512, 256, 128] - - [513, 9061.26] + - [517, 9061.26] - - [49, 2048, 128, 512] - - [511, 6963.26] + - [515, 6963.26] - - [784, 128, 128, 512] - - [520, 8983.53] + - [524, 8983.53] - - [3136, 256, 256, 64] - - [516, 9051.28] + - [520, 9051.28] - - [3136, 64, 128, 64] - - [512, 8581.25] + - [516, 8581.25] - - [49, 2048, 256, 512] - - [511, 7049.54] + - [515, 7049.54] - - [784, 128, 256, 512] - - [522, 9102.89] + - [526, 9102.89] - - [196, 256, 128, 1024] - - [514, 8085.79] + - [518, 8085.79] - - [3136, 64, 128, 256] - - [518, 9381.29] + - [522, 9381.29] - - [3136, 256, 128, 64] - - [516, 8982.54] + - [520, 8982.54] - - [784, 512, 128, 128] - - [513, 8965.89] + - [517, 8965.89] - - [3136, 64, 256, 256] - - [518, 9566.33] + - [522, 9566.33] - - [3136, 64, 256, 64] - - [512, 8743.7] + - [516, 8743.7] - - [196, 1024, 128, 256] - - [515, 8119.33] + - [519, 8119.33] - - [49, 512, 256, 2048] - - [524, 7166.31] + - [528, 7166.31] - - [196, 1024, 256, 256] - - [515, 8210.56] + - [519, 8210.56] - - [5329, 160, 64, 64] - - [531, 8156.79] + - [535, 8156.79] - - [1225, 384, 64, 192] - - [528, 9162.25] + - [532, 9162.25] - - [289, 1024, 64, 256] - - [528, 8483.73] + - [532, 8483.73] - - [64, 1536, 64, 384] - - [538, 9323.55] + - [542, 9323.55] - - [1225, 384, 64, 64] - - [537, 8158.7] + - [541, 8158.7] - - [1225, 384, 64, 96] - - [528, 8540.6] + - [532, 8540.6] - - [64, 1536, 64, 256] - - [534, 9142.9] + - [538, 9142.9] - - [289, 1024, 64, 384] - - [526, 8725.56] + - [530, 8725.56] - - [289, 1024, 64, 192] - - [528, 8313.06] + - [532, 8313.06] - - [289, 1024, 64, 128] - - [534, 7989.41] + - [538, 7989.41] - - [4096, 1024, 1, 2984] - - [573, 9846.29] + - [577, 9846.29] - - [1024, 4096, 1, 3437] - - [574, 9915.7] + - [578, 9915.7] - - [1024, 4096, 1, 3235] - - [567, 9913.92] + - [571, 9913.92] - - [4096, 1024, 1, 4032] - - [573, 9925.96] + - [577, 9925.96] - - [1024, 4096, 1, 3334] - - [574, 9918.17] + - [578, 9918.17] - - [4096, 1024, 1, 3288] - - [574, 9854.57] + - [578, 9854.57] - - [1024, 4096, 1, 3515] - - [574, 9923.93] + - [578, 9923.93] - - [4096, 1024, 1, 3437] - - [574, 9869.53] + - [578, 9869.53] - - [1024, 4096, 1, 3259] - - [574, 9907.55] + - [578, 9907.55] - - [1024, 4096, 1, 3384] - - [566, 9921.11] + - [570, 9921.11] - - [64, 92, 688, 92] - - [544, 6137.79] + - [548, 6137.79] - - [4096, 1024, 1, 3458] - - [573, 9887.59] + - [577, 9887.59] - - [1024, 4096, 1, 3412] - - [573, 9930.46] + - [577, 9930.46] - - [1024, 4096, 1, 3529] - - [567, 9924.44] + - [571, 9924.44] - - [1024, 4096, 1, 4032] - - [574, 9963.38] + - [578, 9963.38] - - [4096, 1024, 1, 3999] - - [574, 9894.9] + - [578, 9894.9] - - [1024, 4096, 1, 3079] - - [567, 9894.48] + - [571, 9894.48] - - [1024, 4096, 1, 3876] - - [566, 9949.29] + - [570, 9949.29] - - [1024, 4096, 1, 3450] - - [574, 9915.55] + - [578, 9915.55] - - [1024, 4096, 1, 3256] - - [574, 9911.08] + - [578, 9911.08] - - [4096, 1024, 1, 3403] - - [573, 9858.83] + - [577, 9858.83] - - [1024, 1024, 1, 3975] - - [564, 8990.71] + - [568, 8990.71] - - [1024, 4096, 1, 3359] - - [574, 9914.9] + - [578, 9914.9] - - [4096, 1024, 1, 3549] - - [573, 9870.56] + - [577, 9870.56] - - [4096, 1024, 1, 3176] - - [574, 9855.82] + - [578, 9855.82] - - [1024, 4096, 1, 3504] - - [566, 9934.07] + - [570, 9934.07] - - [4096, 1024, 1, 3314] - - [573, 9873.8] + - [577, 9873.8] - - [4096, 1024, 1, 3183] - - [573, 9843.74] + - [577, 9843.74] - - [1024, 4096, 1, 3209] - - [567, 9904.87] + - [571, 9904.87] - - [1024, 4096, 1, 3720] - - [566, 9934.06] + - [570, 9934.06] - - [1024, 4096, 1, 3859] - - [566, 9952.43] + - [570, 9952.43] - - [1024, 33708, 1, 4059] - - [566, 10321.4] + - [570, 10321.4] - - [1024, 4096, 1, 3968] - - [566, 9955.86] + - [570, 9955.86] - - [64, 123, 528, 123] - - [539, 6916.11] + - [543, 6916.11] - - [4096, 1024, 1, 3477] - - [574, 9871.93] + - [578, 9871.93] - - [4096, 1024, 1, 3233] - - [574, 9862.25] + - [578, 9862.25] - - [4096, 1024, 1, 3409] - - [574, 9876.76] + - [578, 9876.76] - - [4096, 1024, 1, 3564] - - [574, 9870.39] + - [578, 9870.39] - - [64, 102, 624, 100] - - [539, 5773.06] + - [543, 5773.06] - - [4096, 1024, 1, 3190] - - [573, 9850.87] + - [577, 9850.87] - - [64, 112, 576, 111] - - [539, 6517.25] + - [543, 6517.25] - - [1024, 4096, 1, 3288] - - [573, 9911.8] + - [577, 9911.8] - - [4096, 1024, 1, 3451] - - [573, 9859.51] + - [577, 9859.51] - - [1024, 4096, 1, 3348] - - [566, 9915.37] + - [570, 9915.37] - - [64, 102, 624, 102] - - [539, 5783.6] + - [543, 5783.6] - - [1024, 4096, 1, 3465] - - [567, 9913.02] + - [571, 9913.02] - - [1024, 33708, 1, 4032] - - [566, 10340.3] + - [570, 10340.3] - - [1024, 33708, 1, 3840] - - [566, 10341.7] + - [570, 10341.7] - - [4096, 1024, 1, 3391] - - [574, 9861.67] + - [578, 9861.67] - - [1024, 4096, 1, 3530] - - [566, 9920.34] + - [570, 9920.34] - - [4096, 1024, 1, 3209] - - [573, 9846.9] + - [577, 9846.9] - - [1024, 4096, 1, 3457] - - [567, 9917.19] + - [571, 9917.19] - - [1024, 4096, 1, 3386] - - [566, 9917.55] + - [570, 9917.55] - - [4096, 1024, 1, 3350] - - [573, 9884.44] + - [577, 9884.44] - - [1024, 4096, 1, 3184] - - [574, 9925.88] + - [578, 9925.88] - - [1024, 4096, 1, 3093] - - [573, 9902.45] + - [577, 9902.45] - - [64, 133, 480, 135] - - [556, 6205.87] + - [560, 6205.87] - - [1024, 4096, 1, 3400] - - [566, 9917.0] + - [570, 9917.0] - - [1024, 1024, 1, 4026] - - [572, 9014.29] + - [576, 9014.29] - - [1024, 4096, 1, 3214] - - [566, 9895.84] + - [570, 9895.84] - - [4096, 1024, 1, 3406] - - [574, 9857.72] + - [578, 9857.72] - - [1024, 4096, 1, 3565] - - [573, 9919.27] + - [577, 9919.27] - - [4096, 1024, 1, 3536] - - [574, 9888.96] + - [578, 9888.96] - - [1024, 4096, 1, 3183] - - [573, 9907.45] + - [577, 9907.45] - - [1024, 4096, 1, 3462] - - [574, 9922.3] + - [578, 9922.3] - - [4096, 1024, 1, 3130] - - [567, 9845.94] + - [571, 9845.94] - - [4096, 1024, 1, 3381] - - [574, 9868.17] + - [578, 9868.17] - - [4096, 1024, 1, 3298] - - [573, 9870.44] + - [577, 9870.44] - - [1024, 4096, 1, 3292] - - [566, 9906.2] + - [570, 9906.2] - - [4096, 1024, 1, 3289] - - [573, 9856.45] + - [577, 9856.45] - - [64, 160, 400, 159] - - [559, 7427.74] + - [563, 7427.74] - - [1024, 4096, 1, 3379] - - [566, 9916.99] + - [570, 9916.99] - - [1024, 4096, 1, 3990] - - [567, 9947.27] + - [571, 9947.27] - - [1024, 4096, 1, 3540] - - [574, 9935.66] + - [578, 9935.66] - - [4096, 1024, 1, 3412] - - [574, 9867.46] + - [578, 9867.46] - - [1024, 1024, 1, 3780] - - [569, 9036.16] + - [573, 9036.16] - - [1024, 4096, 1, 3555] - - [573, 9927.27] + - [577, 9927.27] - - [1024, 4096, 1, 3518] - - [567, 9925.45] + - [571, 9925.45] - - [4096, 1024, 1, 3189] - - [573, 9861.14] + - [577, 9861.14] - - [1024, 4096, 1, 3298] - - [567, 9923.12] + - [571, 9923.12] - - [4096, 1024, 1, 3072] - - [573, 9871.98] + - [577, 9871.98] - - [1024, 4096, 1, 3393] - - [574, 9929.18] + - [578, 9929.18] - - [1024, 4096, 1, 3207] - - [566, 9912.71] + - [570, 9912.71] - - [64, 228, 272, 232] - - [562, 7350.04] + - [566, 7350.04] - - [64, 23, 2720, 23] - - [543, 2640.15] + - [547, 2640.15] - - [4096, 1024, 1, 3487] - - [574, 9860.81] + - [578, 9860.81] - - [1024, 1024, 1, 3822] - - [572, 8993.86] + - [576, 8993.86] - - [64, 77, 816, 77] - - [544, 5273.09] + - [548, 5273.09] - - [4096, 1024, 1, 3431] - - [574, 9867.43] + - [578, 9867.43] - - [4096, 1024, 1, 3378] - - [573, 9888.04] + - [577, 9888.04] - - [4096, 1024, 1, 3529] - - [567, 9879.4] + - [571, 9879.4] - - [4096, 1024, 1, 3460] - - [574, 9877.15] + - [578, 9877.15] - - [1024, 4096, 1, 3336] - - [566, 9912.31] + - [570, 9912.31] - - [1024, 4096, 1, 3501] - - [567, 9914.3] + - [571, 9914.3] - - [64, 159, 400, 159] - - [557, 7016.41] + - [561, 7016.41] - - [1024, 4096, 1, 3584] - - [574, 9940.49] + - [578, 9940.49] - - [64, 135, 480, 134] - - [557, 6241.29] + - [561, 6241.29] - - [64, 99, 624, 99] - - [548, 5617.29] + - [552, 5617.29] - - [4096, 1024, 1, 2499] - - [573, 9813.47] + - [577, 9813.47] - - [1024, 1024, 1, 3942] - - [569, 9059.91] + - [573, 9059.91] - - [4096, 1024, 1, 3352] - - [573, 9867.02] + - [577, 9867.02] - - [1024, 4096, 1, 3543] - - [574, 9928.67] + - [578, 9928.67] - - [1024, 4096, 1, 3476] - - [573, 9931.48] + - [577, 9931.48] - - [1024, 33708, 1, 3822] - - [566, 10324.6] + - [570, 10324.6] - - [1024, 4096, 1, 3436] - - [566, 9917.18] + - [570, 9917.18] - - [1024, 1024, 1, 3861] - - [565, 8998.39] + - [569, 8998.39] - - [1024, 1024, 1, 4000] - - [570, 9058.2] + - [574, 9058.2] - - [1024, 4096, 1, 3594] - - [566, 9927.78] + - [570, 9927.78] - - [4096, 1024, 1, 3514] - - [574, 9872.2] + - [578, 9872.2] - - [1024, 4096, 1, 3064] - - [573, 9907.0] + - [577, 9907.0] - - [4096, 1024, 1, 3371] - - [566, 9857.64] + - [570, 9857.64] - - [4096, 1024, 1, 3558] - - [574, 9876.21] + - [578, 9876.21] - - [4096, 1024, 1, 3517] - - [573, 9866.35] + - [577, 9866.35] - - [4096, 1024, 1, 3144] - - [573, 9846.26] + - [577, 9846.26] - - [1024, 4096, 1, 3312] - - [566, 9932.75] + - [570, 9932.75] - - [4096, 1024, 1, 3079] - - [573, 9851.0] + - [577, 9851.0] - - [1024, 4096, 1, 3415] - - [566, 9919.37] + - [570, 9919.37] - - [1024, 4096, 1, 3221] - - [573, 9908.08] + - [577, 9908.08] - - [1024, 4096, 1, 3978] - - [567, 9944.31] + - [571, 9944.31] - - [4096, 1024, 1, 3876] - - [573, 9898.89] + - [577, 9898.89] - - [1024, 4096, 1, 3528] - - [566, 9919.5] + - [570, 9919.5] - - [1024, 4096, 1, 3181] - - [574, 9894.76] + - [578, 9894.76] - - [4096, 1024, 1, 3445] - - [573, 9878.44] + - [577, 9878.44] - - [4096, 1024, 1, 3450] - - [566, 9864.72] + - [570, 9864.72] - - [4096, 1024, 1, 3377] - - [573, 9879.59] + - [577, 9879.59] - - [1024, 4096, 1, 3532] - - [567, 9928.09] + - [571, 9928.09] - - [1024, 33708, 1, 3944] - - [566, 10329.6] + - [570, 10329.6] - - [4096, 1024, 1, 3483] - - [573, 9861.73] + - [577, 9861.73] - - [1024, 4096, 1, 3358] - - [566, 9903.59] + - [570, 9903.59] - - [4096, 1024, 1, 3464] - - [573, 9876.74] + - [577, 9876.74] - - [4096, 1024, 1, 3282] - - [566, 9859.13] + - [570, 9859.13] - - [4096, 1024, 1, 3256] - - [574, 9855.0] + - [578, 9855.0] - - [1024, 4096, 1, 3057] - - [573, 9910.65] + - [577, 9910.65] - - [4096, 1024, 1, 3481] - - [573, 9866.19] + - [577, 9866.19] - - [4096, 1024, 1, 3340] - - [573, 9862.15] + - [577, 9862.15] - - [1024, 1024, 1, 3870] - - [572, 9082.35] + - [576, 9082.35] - - [1024, 4096, 1, 3273] - - [566, 9916.19] + - [570, 9916.19] - - [64, 65, 992, 65] - - [557, 4682.91] + - [561, 4682.91] - - [4096, 1024, 1, 3392] - - [567, 9881.02] + - [571, 9881.02] - - [4096, 1024, 1, 3337] - - [573, 9864.4] + - [577, 9864.4] - - [4096, 1024, 1, 3359] - - [573, 9874.32] + - [577, 9874.32] - - [4096, 1024, 1, 3498] - - [574, 9864.25] + - [578, 9864.25] - - [4096, 1024, 1, 3169] - - [573, 9851.0] + - [577, 9851.0] - - [1024, 33708, 1, 3859] - - [567, 10332.5] + - [571, 10332.5] - - [64, 19, 3264, 19] - - [543, 2182.04] + - [547, 2182.04] - - [1024, 4096, 1, 3103] - - [566, 9898.8] + - [570, 9898.8] - - [4096, 1024, 1, 3900] - - [573, 9897.02] + - [577, 9897.02] - - [1024, 4096, 1, 3442] - - [573, 9938.87] + - [577, 9938.87] - - [1024, 4096, 1, 3248] - - [573, 9939.82] + - [577, 9939.82] - - [1024, 4096, 1, 3351] - - [574, 9923.13] + - [578, 9923.13] - - [4096, 1024, 1, 3593] - - [573, 9894.26] + - [577, 9894.26] - - [1024, 4096, 1, 3780] - - [573, 9941.86] + - [577, 9941.86] - - [64, 133, 480, 133] - - [557, 6180.69] + - [561, 6180.69] - - [1024, 33708, 1, 3681] - - [566, 10332.2] + - [570, 10332.2] - - [4096, 1024, 1, 3374] - - [567, 9859.26] + - [571, 9859.26] - - [1024, 4096, 1, 3557] - - [566, 9928.1] + - [570, 9928.1] - - [4096, 1024, 1, 3906] - - [573, 9906.97] + - [577, 9906.97] - - [4096, 1024, 1, 3504] - - [573, 9885.95] + - [577, 9885.95] - - [1024, 4096, 1, 3270] - - [573, 9916.27] + - [577, 9916.27] - - [4096, 1024, 1, 3098] - - [566, 9854.66] + - [570, 9854.66] - - [64, 232, 272, 232] - - [562, 7394.0] + - [566, 7394.0] - - [4096, 1024, 1, 3216] - - [574, 9876.47] + - [578, 9876.47] - - [64, 148, 432, 148] - - [559, 6663.75] + - [563, 6663.75] - - [1024, 4096, 1, 3550] - - [573, 9920.18] + - [577, 9920.18] - - [4096, 1024, 1, 3449] - - [567, 9870.47] + - [571, 9870.47] - - [1024, 4096, 1, 3403] - - [574, 9908.11] + - [578, 9908.11] - - [1024, 4096, 1, 3523] - - [573, 9932.61] + - [577, 9932.61] - - [1024, 4096, 1, 3486] - - [573, 9917.36] + - [577, 9917.36] - - [1024, 4096, 1, 3564] - - [573, 9923.34] + - [577, 9923.34] - - [1024, 33708, 1, 4005] - - [566, 10339.4] + - [570, 10339.4] - - [4096, 1024, 1, 3296] - - [573, 9879.68] + - [577, 9879.68] - - [1024, 4096, 1, 3263] - - [566, 9907.07] + - [570, 9907.07] - - [64, 25, 2512, 25] - - [543, 2848.07] + - [547, 2848.07] - - [1024, 4096, 1, 3130] - - [574, 9900.0] + - [578, 9900.0] - - [1024, 4096, 1, 3295] - - [574, 9895.35] + - [578, 9895.35] - - [1024, 33708, 1, 3925] - - [567, 10342.2] + - [571, 10342.2] - - [1024, 4096, 1, 3378] - - [566, 9921.27] + - [570, 9921.27] - - [4096, 1024, 1, 3720] - - [574, 9885.72] + - [578, 9885.72] - - [4096, 1024, 1, 3399] - - [573, 9880.55] + - [577, 9880.55] - - [4096, 1024, 1, 3543] - - [574, 9870.63] + - [578, 9870.63] - - [64, 9, 6544, 9] - - [546, 955.07] + - [550, 955.07] - - [4096, 1024, 1, 3497] - - [573, 9868.33] + - [577, 9868.33] - - [4096, 1024, 1, 3594] - - [574, 9876.78] + - [578, 9876.78] - - [1024, 4096, 1, 3144] - - [574, 9901.86] + - [578, 9901.86] - - [1024, 4096, 1, 3975] - - [567, 9950.09] + - [571, 9950.09] - - [4096, 1024, 1, 3205] - - [574, 9855.97] + - [578, 9855.97] - - [1024, 33708, 1, 3995] - - [566, 10331.0] + - [570, 10331.0] - - [1024, 4096, 1, 3392] - - [566, 9935.68] + - [570, 9935.68] - - [1024, 4096, 1, 3055] - - [574, 9893.15] + - [578, 9893.15] - - [1024, 4096, 1, 4026] - - [574, 9940.12] + - [578, 9940.12] - - [4096, 1024, 1, 3557] - - [573, 9883.9] + - [577, 9883.9] - - [4096, 1024, 1, 3515] - - [573, 9871.84] + - [577, 9871.84] - - [4096, 1024, 1, 3486] - - [574, 9860.64] + - [578, 9860.64] - - [4096, 1024, 1, 3457] - - [574, 9885.27] + - [578, 9885.27] - - [1024, 4096, 1, 3511] - - [566, 9928.14] + - [570, 9928.14] - - [4096, 1024, 1, 3138] - - [573, 9853.96] + - [577, 9853.96] - - [1024, 4096, 1, 3339] - - [567, 9912.79] + - [571, 9912.79] - - [1024, 4096, 1, 3939] - - [567, 9952.16] + - [571, 9952.16] - - [4096, 1024, 1, 3500] - - [567, 9863.52] + - [571, 9863.52] - - [4096, 1024, 1, 3395] - - [574, 9883.72] + - [578, 9883.72] - - [4096, 1024, 1, 3968] - - [574, 9920.26] + - [578, 9920.26] - - [4096, 1024, 1, 4020] - - [574, 9912.71] + - [578, 9912.71] - - [4096, 1024, 1, 3942] - - [573, 9910.07] + - [577, 9910.07] - - [1024, 1024, 1, 4032] - - [563, 9024.64] + - [567, 9024.64] - - [4096, 1024, 1, 3349] - - [574, 9865.94] + - [578, 9865.94] - - [1024, 4096, 1, 3322] - - [567, 9908.33] + - [571, 9908.33] - - [4096, 1024, 1, 3452] - - [573, 9872.59] + - [577, 9872.59] - - [1024, 4096, 1, 3417] - - [573, 9912.54] + - [577, 9912.54] - - [1024, 1024, 1, 4012] - - [571, 9085.37] + - [575, 9085.37] - - [1024, 4096, 1, 3526] - - [567, 9920.26] + - [571, 9920.26] - - [4096, 1024, 1, 3485] - - [567, 9861.54] + - [571, 9861.54] - - [1024, 1024, 1, 3681] - - [571, 8991.36] + - [575, 8991.36] - - [4096, 1024, 1, 3303] - - [574, 9861.2] + - [578, 9861.2] - - [4096, 1024, 1, 3344] - - [574, 9892.34] + - [578, 9892.34] - - [1024, 4096, 1, 3479] - - [574, 9921.67] + - [578, 9921.67] - - [4096, 1024, 1, 3300] - - [573, 9868.54] + - [577, 9868.54] - - [1024, 4096, 1, 3439] - - [567, 9918.19] + - [571, 9918.19] - - [4096, 1024, 1, 3280] - - [574, 9875.19] + - [578, 9875.19] - - [1024, 4096, 1, 3245] - - [566, 9910.39] + - [570, 9910.39] - - [1024, 4096, 1, 3328] - - [566, 9941.5] + - [570, 9941.5] - - [4096, 1024, 1, 3418] - - [566, 9870.66] + - [570, 9870.66] - - [1024, 4096, 1, 3493] - - [574, 9938.35] + - [578, 9938.35] - - [1024, 4096, 1, 3500] - - [566, 9916.83] + - [570, 9916.83] - - [1024, 4096, 1, 3166] - - [566, 9898.02] + - [570, 9898.02] - - [4096, 1024, 1, 3126] - - [567, 9846.94] + - [571, 9846.94] - - [1024, 4096, 1, 3277] - - [574, 9898.56] + - [578, 9898.56] - - [1024, 4096, 1, 3315] - - [573, 9923.01] + - [577, 9923.01] - - [1024, 1024, 1, 3927] - - [564, 8987.61] + - [568, 8987.61] - - [1024, 4096, 1, 3414] - - [566, 9915.91] + - [570, 9915.91] - - [4096, 1024, 1, 3531] - - [573, 9871.82] + - [577, 9871.82] - - [4096, 1024, 1, 3484] - - [566, 9867.76] + - [570, 9867.76] - - [1024, 4096, 1, 3180] - - [573, 9903.99] + - [577, 9903.99] - - [4096, 1024, 1, 3360] - - [573, 9879.47] + - [577, 9879.47] - - [1024, 33708, 1, 3990] - - [566, 10334.9] + - [570, 10334.9] - - [4096, 1024, 1, 3466] - - [573, 9874.92] + - [577, 9874.92] - - [1024, 4096, 1, 3428] - - [566, 9915.92] + - [570, 9915.92] - - [1024, 4096, 1, 3137] - - [573, 9913.17] + - [577, 9913.17] - - [4096, 1024, 1, 4059] - - [573, 9901.76] + - [577, 9901.76] - - [1024, 4096, 1, 3353] - - [573, 9914.5] + - [577, 9914.5] - - [1024, 4096, 1, 3942] - - [573, 9944.4] + - [577, 9944.4] - - [4096, 1024, 1, 3506] - - [566, 9875.65] + - [570, 9875.65] - - [1024, 1024, 1, 3894] - - [564, 8946.45] + - [568, 8946.45] - - [4096, 1024, 1, 3508] - - [574, 9877.57] + - [578, 9877.57] - - [64, 132, 480, 135] - - [557, 6164.76] + - [561, 6164.76] - - [4096, 1024, 1, 3956] - - [566, 9907.73] + - [570, 9907.73] - - [64, 7, 8192, 7] - - [545, 812.978] + - [549, 812.978] - - [1024, 4096, 1, 3272] - - [567, 9909.72] + - [571, 9909.72] - - [1024, 4096, 1, 3443] - - [574, 9929.73] + - [578, 9929.73] - - [1024, 4096, 1, 3375] - - [574, 9909.13] + - [578, 9909.13] - - [1024, 4096, 1, 3525] - - [574, 9929.17] + - [578, 9929.17] - - [4096, 1024, 1, 3472] - - [573, 9889.87] + - [577, 9889.87] - - [1024, 4096, 1, 3520] - - [566, 9947.69] + - [570, 9947.69] - - [4096, 1024, 1, 3322] - - [573, 9862.88] + - [577, 9862.88] - - [4096, 1024, 1, 3387] - - [573, 9861.52] + - [577, 9861.52] - - [64, 8, 7280, 8] - - [551, 1024.0] + - [555, 1024.0] - - [1024, 33708, 1, 3939] - - [566, 10339.8] + - [570, 10339.8] - - [4096, 1024, 1, 3345] - - [574, 9873.58] + - [578, 9873.58] - - [4096, 1024, 1, 2967] - - [573, 9839.11] + - [577, 9839.11] - - [1024, 4096, 1, 3453] - - [566, 9905.71] + - [570, 9905.71] - - [1024, 4096, 1, 3640] - - [573, 9933.95] + - [577, 9933.95] - - [4096, 1024, 1, 3291] - - [567, 9860.74] + - [571, 9860.74] - - [1024, 4096, 1, 3350] - - [574, 9917.93] + - [578, 9917.93] - - [4096, 1024, 1, 3417] - - [573, 9864.51] + - [577, 9864.51] - - [64, 135, 480, 135] - - [557, 6265.35] + - [561, 6265.35] - - [1024, 4096, 1, 3467] - - [567, 9906.85] + - [571, 9906.85] - - [1024, 4096, 1, 3491] - - [573, 9933.2] + - [577, 9933.2] - - [1024, 4096, 1, 3822] - - [573, 9938.65] + - [577, 9938.65] - - [4096, 1024, 1, 3292] - - [573, 9849.11] + - [577, 9849.11] - - [1024, 4096, 1, 3231] - - [566, 9905.72] + - [570, 9905.72] - - [1024, 4096, 1, 3364] - - [567, 9930.22] + - [571, 9930.22] - - [1024, 4096, 1, 3995] - - [567, 9943.66] + - [571, 9943.66] - - [1024, 4096, 1, 3545] - - [566, 9928.43] + - [570, 9928.43] - - [1024, 1024, 1, 3876] - - [564, 9002.94] + - [568, 9002.94] - - [1024, 4096, 1, 3186] - - [566, 9920.91] + - [570, 9920.91] - - [4096, 1024, 1, 3432] - - [573, 9875.19] + - [577, 9875.19] - - [64, 84, 752, 85] - - [544, 5704.41] + - [548, 5704.41] - - [4096, 1024, 1, 3367] - - [567, 9867.96] + - [571, 9867.96] - - [4096, 1024, 1, 3503] - - [574, 9870.91] + - [578, 9870.91] - - [1024, 4096, 1, 3095] - - [567, 9902.8] + - [571, 9902.8] - - [4096, 1024, 1, 3465] - - [574, 9872.07] + - [578, 9872.07] - - [1024, 4096, 1, 3402] - - [573, 9914.56] + - [577, 9914.56] - - [4096, 1024, 1, 3140] - - [573, 9847.85] + - [577, 9847.85] - - [1024, 1024, 1, 4050] - - [570, 9055.65] + - [574, 9055.65] - - [4096, 1024, 1, 3424] - - [567, 9894.52] + - [571, 9894.52] - - [4096, 1024, 1, 3257] - - [566, 9860.87] + - [570, 9860.87] - - [4096, 1024, 1, 2917] - - [573, 9845.81] + - [577, 9845.81] - - [1024, 33708, 1, 3640] - - [566, 10321.6] + - [570, 10321.6] - - [1024, 4096, 1, 3456] - - [566, 9950.25] + - [570, 9950.25] - - [1024, 4096, 1, 3014] - - [566, 9907.87] + - [570, 9907.87] - - [4096, 1024, 1, 3372] - - [574, 9868.27] + - [578, 9868.27] - - [64, 132, 480, 132] - - [557, 6121.52] + - [561, 6121.52] - - [1024, 4096, 1, 3294] - - [574, 9903.13] + - [578, 9903.13] - - [4096, 1024, 1, 3446] - - [574, 9871.59] + - [578, 9871.59] - - [1024, 4096, 1, 3389] - - [567, 9909.17] + - [571, 9909.17] - - [4096, 1024, 1, 3259] - - [573, 9860.66] + - [577, 9860.66] - - [4096, 1024, 1, 3544] - - [573, 9878.66] + - [577, 9878.66] - - [4096, 1024, 1, 3479] - - [574, 9873.87] + - [578, 9873.87] - - [4096, 1024, 1, 3542] - - [573, 9878.87] + - [577, 9878.87] - - [4096, 1024, 1, 3321] - - [566, 9861.03] + - [570, 9861.03] - - [1024, 4096, 1, 3147] - - [566, 9894.67] + - [570, 9894.67] - - [1024, 4096, 1, 3944] - - [566, 9950.41] + - [570, 9950.41] - - [4096, 1024, 1, 3870] - - [574, 9881.64] + - [578, 9881.64] - - [1024, 4096, 1, 3308] - - [566, 9907.16] + - [570, 9907.16] - - [4096, 1024, 1, 3401] - - [573, 9864.49] + - [577, 9864.49] - - [1024, 4096, 1, 3395] - - [566, 9928.93] + - [570, 9928.93] - - [64, 99, 624, 102] - - [542, 5651.26] + - [546, 5651.26] - - [1024, 4096, 1, 3563] - - [573, 9922.66] + - [577, 9922.66] - - [1024, 33708, 1, 3870] - - [566, 10325.3] + - [570, 10325.3] - - [4096, 1024, 1, 3494] - - [573, 9875.27] + - [577, 9875.27] - - [1024, 4096, 1, 3271] - - [566, 9912.99] + - [570, 9912.99] - - [1024, 33708, 1, 3910] - - [566, 10341.4] + - [570, 10341.4] - - [1024, 4096, 1, 3287] - - [574, 9924.77] + - [578, 9924.77] - - [1024, 33708, 1, 3860] - - [566, 10330.6] + - [570, 10330.6] - - [64, 143, 432, 148] - - [559, 6571.68] + - [563, 6571.68] - - [1024, 1024, 1, 3584] - - [571, 8975.21] + - [575, 8975.21] - - [64, 162, 400, 162] - - [561, 6822.16] + - [565, 6822.16] - - [4096, 1024, 1, 3341] - - [573, 9854.56] + - [577, 9854.56] - - [1024, 4096, 1, 3136] - - [566, 9926.76] + - [570, 9926.76] - - [4096, 1024, 1, 3439] - - [573, 9854.23] + - [577, 9854.23] - - [64, 148, 432, 147] - - [557, 6677.51] + - [561, 6677.51] - - [1024, 4096, 1, 3751] - - [573, 9938.38] + - [577, 9938.38] - - [1024, 4096, 1, 3301] - - [573, 9919.05] + - [577, 9919.05] - - [4096, 1024, 1, 3468] - - [574, 9859.73] + - [578, 9859.73] - - [1024, 4096, 1, 3416] - - [574, 9918.42] + - [578, 9918.42] - - [4096, 1024, 1, 3163] - - [573, 9854.55] + - [577, 9854.55] - - [1024, 4096, 1, 3230] - - [567, 9897.44] + - [571, 9897.44] - - [1024, 4096, 1, 3581] - - [567, 9915.38] + - [571, 9915.38] - - [1024, 1024, 1, 3960] - - [569, 9045.76] + - [573, 9045.76] - - [4096, 1024, 1, 3463] - - [574, 9884.64] + - [578, 9884.64] - - [1024, 4096, 1, 3478] - - [567, 9926.92] + - [571, 9926.92] - - [4096, 1024, 1, 3262] - - [573, 9852.12] + - [577, 9852.12] - - [1024, 4096, 1, 3438] - - [573, 9912.58] + - [577, 9912.58] - - [1024, 4096, 1, 3244] - - [566, 9900.41] + - [570, 9900.41] - - [1024, 4096, 1, 3445] - - [566, 9920.22] + - [570, 9920.22] - - [4096, 1024, 1, 3328] - - [573, 9887.97] + - [577, 9887.97] - - [1024, 4096, 1, 3492] - - [567, 9937.12] + - [571, 9937.12] - - [4096, 1024, 1, 3211] - - [567, 9847.85] + - [571, 9847.85] - - [1024, 4096, 1, 3910] - - [574, 9946.47] + - [578, 9946.47] - - [1024, 4096, 1, 3314] - - [566, 9932.5] + - [570, 9932.5] - - [4096, 1024, 1, 3859] - - [573, 9902.74] + - [577, 9902.74] - - [4096, 1024, 1, 3383] - - [573, 9875.1] + - [577, 9875.1] - - [1024, 4096, 1, 3409] - - [574, 9926.69] + - [578, 9926.69] - - [1024, 4096, 1, 4020] - - [566, 9941.7] + - [570, 9941.7] - - [4096, 1024, 1, 3530] - - [573, 9872.71] + - [577, 9872.71] - - [4096, 1024, 1, 3411] - - [574, 9874.92] + - [578, 9874.92] - - [1024, 4096, 1, 3566] - - [574, 9921.0] + - [578, 9921.0] - - [4096, 1024, 1, 3493] - - [566, 9875.64] + - [570, 9875.64] - - [4096, 1024, 1, 3184] - - [573, 9873.04] + - [577, 9873.04] - - [1024, 4096, 1, 3072] - - [566, 9923.69] + - [570, 9923.69] - - [1024, 4096, 1, 3431] - - [567, 9910.93] + - [571, 9910.93] - - [4096, 1024, 1, 3306] - - [574, 9853.32] + - [578, 9853.32] - - [1024, 4096, 1, 3352] - - [574, 9913.22] + - [578, 9913.22] - - [4096, 1024, 1, 3295] - - [573, 9862.58] + - [577, 9862.58] - - [64, 123, 528, 122] - - [539, 6950.15] + - [543, 6950.15] - - [1024, 4096, 1, 3517] - - [567, 9919.96] + - [571, 9919.96] - - [64, 102, 624, 101] - - [547, 5791.39] + - [551, 5791.39] - - [4096, 1024, 1, 3426] - - [573, 9891.04] + - [577, 9891.04] - - [4096, 1024, 1, 3385] - - [573, 9868.31] + - [577, 9868.31] - - [1024, 1024, 1, 3978] - - [564, 9008.38] + - [568, 9008.38] - - [4096, 1024, 1, 3572] - - [566, 9884.71] + - [570, 9884.71] - - [4096, 1024, 1, 3459] - - [573, 9892.07] + - [577, 9892.07] - - [1024, 4096, 1, 3374] - - [574, 9908.42] + - [578, 9908.42] - - [4096, 1024, 1, 3166] - - [573, 9832.35] + - [577, 9832.35] - - [4096, 1024, 1, 3093] - - [574, 9841.15] + - [578, 9841.15] - - [4096, 1024, 1, 3523] - - [567, 9878.95] + - [571, 9878.95] - - [4096, 1024, 1, 3413] - - [567, 9880.71] + - [571, 9880.71] - - [1024, 4096, 1, 3996] - - [566, 9948.04] + - [570, 9948.04] - - [1024, 4096, 1, 3452] - - [574, 9915.87] + - [578, 9915.87] - - [4096, 1024, 1, 3232] - - [574, 9876.44] + - [578, 9876.44] - - [4096, 1024, 1, 3400] - - [566, 9867.05] + - [570, 9867.05] - - [4096, 1024, 1, 3334] - - [573, 9868.89] + - [577, 9868.89] - - [1024, 4096, 1, 3345] - - [566, 9920.5] + - [570, 9920.5] - - [1024, 4096, 1, 3538] - - [573, 9933.24] + - [577, 9933.24] - - [1024, 4096, 1, 3466] - - [573, 9920.75] + - [577, 9920.75] - - [4096, 1024, 1, 3315] - - [573, 9876.77] + - [577, 9876.77] - - [4096, 1024, 1, 3214] - - [574, 9847.83] + - [578, 9847.83] - - [1024, 33708, 1, 3900] - - [566, 10331.6] + - [570, 10331.6] - - [64, 160, 400, 160] - - [559, 7440.51] + - [563, 7440.51] - - [1024, 4096, 1, 3367] - - [573, 9926.22] + - [577, 9926.22] - - [1024, 4096, 1, 2917] - - [574, 9904.47] + - [578, 9904.47] - - [1024, 1024, 1, 3995] - - [565, 9000.23] + - [569, 9000.23] - - [64, 132, 480, 134] - - [557, 6146.78] + - [561, 6146.78] - - [1024, 4096, 1, 3544] - - [574, 9924.04] + - [578, 9924.04] - - [4096, 1024, 1, 3414] - - [574, 9867.8] + - [578, 9867.8] - - [4096, 1024, 1, 3565] - - [567, 9870.03] + - [571, 9870.03] - - [1024, 4096, 1, 3512] - - [573, 9919.74] + - [577, 9919.74] - - [1024, 4096, 1, 3191] - - [574, 9914.69] + - [578, 9914.69] - - [64, 27, 2336, 27] - - [541, 3054.61] + - [545, 3054.61] - - [1024, 4096, 1, 3289] - - [574, 9917.1] + - [578, 9917.1] - - [4096, 1024, 1, 3290] - - [573, 9858.31] + - [577, 9858.31] - - [1024, 4096, 1, 3211] - - [574, 9897.06] + - [578, 9897.06] - - [1024, 33708, 1, 3969] - - [567, 10336.0] + - [571, 10336.0] - - [4096, 1024, 1, 3566] - - [573, 9862.9] + - [577, 9862.9] - - [64, 111, 576, 111] - - [547, 6400.81] + - [551, 6400.81] - - [1024, 4096, 1, 3459] - - [573, 9922.93] + - [577, 9922.93] - - [1024, 4096, 1, 3372] - - [566, 9909.76] + - [570, 9909.76] - - [4096, 1024, 1, 3339] - - [573, 9859.2] + - [577, 9859.2] - - [4096, 1024, 1, 3425] - - [573, 9889.24] + - [577, 9889.24] - - [4096, 1024, 1, 3388] - - [573, 9871.57] + - [577, 9871.57] - - [1024, 4096, 1, 3531] - - [566, 9918.9] + - [570, 9918.9] - - [4096, 1024, 1, 3286] - - [574, 9868.32] + - [578, 9868.32] - - [4096, 1024, 1, 3462] - - [573, 9881.78] + - [577, 9881.78] - - [1024, 4096, 1, 3388] - - [566, 9904.59] + - [570, 9904.59] - - [4096, 1024, 1, 3165] - - [566, 9836.23] + - [570, 9836.23] - - [4096, 1024, 1, 3304] - - [573, 9857.45] + - [577, 9857.45] - - [1024, 4096, 1, 2736] - - [573, 9900.97] + - [577, 9900.97] - - [4096, 1024, 1, 3397] - - [573, 9872.0] + - [577, 9872.0] - - [64, 38, 1680, 38] - - [540, 3459.42] + - [544, 3459.42] - - [1024, 4096, 1, 3311] - - [574, 9908.22] + - [578, 9908.22] - - [1024, 4096, 1, 3394] - - [574, 9929.33] + - [578, 9929.33] - - [4096, 1024, 1, 2736] - - [573, 9833.78] + - [577, 9833.78] - - [1024, 4096, 1, 3559] - - [567, 9925.23] + - [571, 9925.23] - - [4096, 1024, 1, 3180] - - [573, 9837.95] + - [577, 9837.95] - - [1024, 4096, 1, 3480] - - [566, 9922.36] + - [570, 9922.36] - - [4096, 1024, 1, 3318] - - [573, 9867.77] + - [577, 9867.77] - - [4096, 1024, 1, 3213] - - [573, 9845.92] + - [577, 9845.92] - - [1024, 4096, 1, 3286] - - [573, 9912.04] + - [577, 9912.04] - - [4096, 1024, 1, 3471] - - [573, 9874.14] + - [577, 9874.14] - - [1024, 4096, 1, 3381] - - [574, 9922.86] + - [578, 9922.86] - - [64, 100, 624, 100] - - [548, 5705.14] + - [552, 5705.14] - - [4096, 1024, 1, 3502] - - [573, 9872.34] + - [577, 9872.34] - - [64, 16, 3840, 16] - - [554, 2091.57] + - [558, 2091.57] - - [1024, 4096, 1, 3552] - - [566, 9943.79] + - [570, 9943.79] - - [4096, 1024, 1, 3519] - - [574, 9869.85] + - [578, 9869.85] - - [1024, 4096, 1, 3300] - - [567, 9916.05] + - [571, 9916.05] - - [1024, 4096, 1, 3419] - - [566, 9913.96] + - [570, 9913.96] - - [4096, 1024, 1, 4030] - - [567, 9893.63] + - [571, 9893.63] - - [4096, 1024, 1, 3976] - - [574, 9898.25] + - [578, 9898.25] - - [1024, 4096, 1, 3473] - - [574, 9928.32] + - [578, 9928.32] - - [1024, 1024, 1, 3977] - - [571, 9009.23] + - [575, 9009.23] - - [4096, 1024, 1, 3428] - - [573, 9876.69] + - [577, 9876.69] - - [1024, 4096, 1, 3433] - - [567, 9923.82] + - [571, 9923.82] - - [4096, 1024, 1, 3534] - - [567, 9863.9] + - [571, 9863.9] - - [4096, 1024, 1, 3461] - - [573, 9873.02] + - [577, 9873.02] - - [4096, 1024, 1, 3681] - - [573, 9898.47] + - [577, 9898.47] - - [4096, 1024, 1, 3495] - - [574, 9875.98] + - [578, 9875.98] - - [4096, 1024, 1, 3351] - - [573, 9879.61] + - [577, 9879.61] - - [1024, 4096, 1, 4059] - - [566, 9948.51] + - [570, 9948.51] - - [4096, 1024, 1, 3990] - - [573, 9900.66] + - [577, 9900.66] - - [1024, 4096, 1, 3325] - - [567, 9903.2] + - [571, 9903.2] - - [1024, 4096, 1, 3408] - - [573, 9932.05] + - [577, 9932.05] - - [64, 59, 1088, 59] - - [547, 5343.67] + - [551, 5343.67] - - [4096, 1024, 1, 3394] - - [574, 9878.07] + - [578, 9878.07] - - [1024, 4096, 1, 3573] - - [574, 9935.2] + - [578, 9935.2] - - [4096, 1024, 1, 3386] - - [573, 9866.28] + - [577, 9866.28] - - [4096, 1024, 1, 3540] - - [573, 9882.23] + - [577, 9882.23] - - [1024, 4096, 1, 3182] - - [567, 9894.35] + - [571, 9894.35] - - [1024, 4096, 1, 3430] - - [566, 9915.14] + - [570, 9915.14] - - [1024, 4096, 1, 3236] - - [574, 9920.46] + - [578, 9920.46] - - [4096, 1024, 1, 2977] - - [573, 9847.98] + - [577, 9847.98] - - [1024, 4096, 1, 3355] - - [573, 9908.68] + - [577, 9908.68] - - [4096, 1024, 1, 3139] - - [573, 9850.61] + - [577, 9850.61] - - [4096, 1024, 1, 3516] - - [567, 9874.11] + - [571, 9874.11] - - [4096, 1024, 1, 3368] - - [567, 9872.54] + - [571, 9872.54] - - [4096, 1024, 1, 3559] - - [566, 9884.22] + - [570, 9884.22] - - [64, 11, 5456, 11] - - [554, 1382.57] + - [558, 1382.57] - - [1024, 4096, 1, 3506] - - [573, 9937.59] + - [577, 9937.59] - - [1024, 4096, 1, 3145] - - [566, 9905.01] + - [570, 9905.01] - - [1024, 4096, 1, 3369] - - [573, 9912.61] + - [577, 9912.61] - - [64, 112, 576, 112] - - [539, 6583.46] + - [543, 6583.46] - - [4096, 1024, 1, 3522] - - [573, 9889.37] + - [577, 9889.37] - - [1024, 33708, 1, 3894] - - [566, 10337.4] + - [570, 10337.4] - - [64, 159, 400, 162] - - [557, 7056.99] + - [561, 7056.99] - - [4096, 1024, 1, 3336] - - [573, 9867.57] + - [577, 9867.57] - - [1024, 4096, 1, 3382] - - [567, 9915.8] + - [571, 9915.8] - - [4096, 1024, 1, 3533] - - [573, 9878.46] + - [577, 9878.46] - - [4096, 1024, 1, 4050] - - [574, 9916.72] + - [578, 9916.72] - - [4096, 1024, 1, 3480] - - [567, 9869.22] + - [571, 9869.22] - - [1024, 4096, 1, 3344] - - [566, 9935.51] + - [570, 9935.51] - - [64, 122, 528, 122] - - [539, 6871.04] + - [543, 6871.04] - - [1024, 4096, 1, 3509] - - [567, 9925.7] + - [571, 9925.7] - - [1024, 4096, 1, 3956] - - [566, 9958.16] + - [570, 9958.16] - - [4096, 1024, 1, 3616] - - [573, 9904.53] + - [577, 9904.53] - - [1024, 4096, 1, 3366] - - [566, 9919.37] + - [570, 9919.37] - - [4096, 1024, 1, 2935] - - [566, 9833.13] + - [570, 9833.13] - - [4096, 1024, 1, 3393] - - [573, 9877.35] + - [577, 9877.35] - - [4096, 1024, 1, 3547] - - [567, 9865.0] + - [571, 9865.0] - - [1024, 4096, 1, 3499] - - [574, 9912.39] + - [578, 9912.39] - - [4096, 1024, 1, 3357] - - [573, 9855.18] + - [577, 9855.18] - - [4096, 1024, 1, 3272] - - [573, 9861.87] + - [577, 9861.87] - - [4096, 1024, 1, 3207] - - [573, 9847.68] + - [577, 9847.68] - - [4096, 1024, 1, 3894] - - [573, 9918.76] + - [577, 9918.76] - - [1024, 4096, 1, 3444] - - [573, 9932.61] + - [577, 9932.61] - - [4096, 1024, 1, 3561] - - [573, 9872.51] + - [577, 9872.51] - - [4096, 1024, 1, 3376] - - [573, 9885.49] + - [577, 9885.49] - - [1024, 4096, 1, 3458] - - [573, 9929.29] + - [577, 9929.29] - - [4096, 1024, 1, 3231] - - [567, 9846.98] + - [571, 9846.98] - - [64, 228, 272, 228] - - [568, 7302.59] + - [572, 7302.59] - - [1024, 4096, 1, 3505] - - [574, 9931.53] + - [578, 9931.53] - - [4096, 1024, 1, 3277] - - [573, 9857.1] + - [577, 9857.1] - - [64, 21, 2976, 21] - - [543, 2436.04] + - [547, 2436.04] - - [1024, 4096, 1, 3391] - - [573, 9911.15] + - [577, 9911.15] - - [64, 32, 1984, 32] - - [555, 3572.07] + - [559, 3572.07] - - [1024, 4096, 1, 3536] - - [574, 9946.8] + - [578, 9946.8] - - [1024, 4096, 1, 3063] - - [573, 9906.82] + - [577, 9906.82] - - [1024, 1024, 1, 3925] - - [565, 9011.35] + - [569, 9011.35] - - [1024, 4096, 1, 3189] - - [567, 9900.85] + - [571, 9900.85] - - [1024, 4096, 1, 2505] - - [573, 9854.75] + - [577, 9854.75] - - [4096, 1024, 1, 3454] - - [566, 9864.86] + - [570, 9864.86] - - [1024, 4096, 1, 3405] - - [574, 9906.23] + - [578, 9906.23] - - [1024, 33708, 1, 4050] - - [567, 10343.6] + - [571, 10343.6] - - [4096, 1024, 1, 3520] - - [573, 9886.93] + - [577, 9886.93] - - [64, 93, 688, 93] - - [550, 6222.76] + - [554, 6222.76] - - [1024, 4096, 1, 3487] - - [574, 9918.59] + - [578, 9918.59] - - [1024, 4096, 1, 3558] - - [574, 9930.89] + - [578, 9930.89] - - [4096, 1024, 1, 3297] - - [573, 9874.21] + - [577, 9874.21] - - [1024, 1024, 1, 3840] - - [569, 9075.32] + - [573, 9075.32] - - [1024, 4096, 1, 3483] - - [573, 9915.28] + - [577, 9915.28] - - [1024, 1024, 1, 3956] - - [572, 9009.93] + - [576, 9009.93] - - [1024, 33708, 1, 3751] - - [567, 10325.8] + - [571, 10325.8] - - [4096, 1024, 1, 3380] - - [573, 9888.37] + - [577, 9888.37] - - [1024, 4096, 1, 3380] - - [566, 9927.15] + - [570, 9927.15] - - [1024, 4096, 1, 3396] - - [574, 9931.86] + - [578, 9931.86] - - [1024, 4096, 1, 3497] - - [567, 9914.76] + - [571, 9914.76] - - [1024, 4096, 1, 3502] - - [574, 9921.42] + - [578, 9921.42] - - [1024, 1024, 1, 3976] - - [569, 9060.2] + - [573, 9060.2] - - [1024, 4096, 1, 3138] - - [567, 9908.56] + - [571, 9908.56] - - [4096, 1024, 1, 3939] - - [566, 9910.13] + - [570, 9910.13] - - [1024, 4096, 1, 3303] - - [567, 9916.54] + - [571, 9916.54] - - [64, 111, 576, 112] - - [547, 6495.09] + - [551, 6495.09] - - [1024, 4096, 1, 3418] - - [573, 9913.25] + - [577, 9913.25] - - [1024, 4096, 1, 3224] - - [567, 9903.95] + - [571, 9903.95] - - [4096, 1024, 1, 3978] - - [573, 9896.18] + - [577, 9896.18] - - [1024, 4096, 1, 3472] - - [566, 9937.38] + - [570, 9937.38] - - [4096, 1024, 1, 3353] - - [574, 9863.87] + - [578, 9863.87] - - [4096, 1024, 1, 3362] - - [573, 9870.96] + - [577, 9870.96] - - [1024, 33708, 1, 3978] - - [566, 10325.3] + - [570, 10325.3] - - [64, 100, 624, 102] - - [542, 5695.57] + - [546, 5695.57] - - [1024, 4096, 1, 3432] - - [574, 9915.46] + - [578, 9915.46] - - [1024, 4096, 1, 3139] - - [573, 9914.11] + - [577, 9914.11] - - [1024, 4096, 1, 3341] - - [574, 9912.0] + - [578, 9912.0] - - [1024, 4096, 1, 3494] - - [567, 9924.5] + - [571, 9924.5] - - [1024, 4096, 1, 3969] - - [566, 9952.18] + - [570, 9952.18] - - [1024, 4096, 1, 3163] - - [574, 9911.69] + - [578, 9911.69] - - [1024, 1024, 1, 3955] - - [564, 9097.76] + - [568, 9097.76] - - [4096, 1024, 1, 3405] - - [573, 9853.74] + - [577, 9853.74] - - [1024, 1024, 1, 4030] - - [564, 9083.76] + - [568, 9083.76] - - [4096, 1024, 1, 3453] - - [573, 9858.78] + - [577, 9858.78] - - [1024, 4096, 1, 3411] - - [574, 9926.44] + - [578, 9926.44] - - [1024, 4096, 1, 3527] - - [567, 9922.55] + - [571, 9922.55] - - [4096, 1024, 1, 3474] - - [573, 9878.39] + - [577, 9878.39] - - [1024, 4096, 1, 3572] - - [573, 9931.9] + - [577, 9931.9] - - [4096, 1024, 1, 3293] - - [573, 9848.16] + - [577, 9848.16] - - [4096, 1024, 1, 3247] - - [573, 9861.35] + - [577, 9861.35] - - [64, 15, 4096, 15] - - [554, 1955.65] + - [558, 1955.65] - - [1024, 4096, 1, 3425] - - [574, 9936.3] + - [578, 9936.3] - - [1024, 4096, 1, 3354] - - [566, 9917.45] + - [570, 9917.45] - - [4096, 1024, 1, 3382] - - [573, 9885.39] + - [577, 9885.39] - - [4096, 1024, 1, 3236] - - [573, 9860.5] + - [577, 9860.5] - - [1024, 4096, 1, 3519] - - [574, 9919.2] + - [578, 9919.2] - - [4096, 1024, 1, 3354] - - [573, 9854.65] + - [577, 9854.65] - - [4096, 1024, 1, 3501] - - [574, 9869.52] + - [578, 9869.52] - - [1024, 1024, 1, 3906] - - [572, 9104.89] + - [576, 9104.89] - - [4096, 1024, 1, 3266] - - [573, 9873.87] + - [577, 9873.87] - - [64, 101, 624, 102] - - [542, 5765.42] + - [546, 5765.42] - - [1024, 4096, 1, 3368] - - [573, 9909.67] + - [577, 9909.67] - - [1024, 4096, 1, 4030] - - [574, 9940.17] + - [578, 9940.17] - - [1024, 4096, 1, 3533] - - [567, 9916.54] + - [571, 9916.54] - - [4096, 1024, 1, 3332] - - [574, 9876.35] + - [578, 9876.35] - - [4096, 1024, 1, 3584] - - [573, 9896.5] + - [577, 9896.5] - - [1024, 4096, 1, 3616] - - [573, 9957.08] + - [577, 9957.08] - - [4096, 1024, 1, 3265] - - [573, 9877.68] + - [577, 9877.68] - - [4096, 1024, 1, 3361] - - [573, 9888.51] + - [577, 9888.51] - - [4096, 1024, 1, 3467] - - [573, 9863.3] + - [577, 9863.3] - - [1024, 4096, 1, 3454] - - [567, 9904.79] + - [571, 9904.79] - - [1024, 4096, 1, 3101] - - [574, 9893.02] + - [578, 9893.02] - - [1024, 4096, 1, 3508] - - [574, 9931.44] + - [578, 9931.44] - - [4096, 1024, 1, 3267] - - [573, 9864.38] + - [577, 9864.38] - - [64, 54, 1184, 54] - - [539, 4905.92] + - [543, 4905.92] - - [4096, 1024, 1, 3419] - - [573, 9872.46] + - [577, 9872.46] - - [4096, 1024, 1, 3822] - - [573, 9892.53] + - [577, 9892.53] - - [1024, 4096, 1, 3266] - - [573, 9918.48] + - [577, 9918.48] - - [4096, 1024, 1, 3440] - - [574, 9890.06] + - [578, 9890.06] - - [1024, 4096, 1, 3361] - - [573, 9930.87] + - [577, 9930.87] - - [1024, 4096, 1, 3546] - - [567, 9926.46] + - [571, 9926.46] - - [4096, 1024, 1, 3473] - - [573, 9888.96] + - [577, 9888.96] - - [4096, 1024, 1, 3546] - - [574, 9872.17] + - [578, 9872.17] - - [1024, 4096, 1, 3088] - - [567, 9917.93] + - [571, 9917.93] - - [1024, 4096, 1, 3535] - - [574, 9921.1] + - [578, 9921.1] - - [1024, 4096, 1, 3447] - - [574, 9920.53] + - [578, 9920.53] - - [1024, 4096, 1, 3560] - - [573, 9925.38] + - [577, 9925.38] - - [1024, 4096, 1, 3422] - - [567, 9922.11] + - [571, 9922.11] - - [1024, 4096, 1, 3469] - - [566, 9906.08] + - [570, 9906.08] - - [4096, 1024, 1, 3488] - - [573, 9903.16] + - [577, 9903.16] - - [1024, 4096, 1, 3110] - - [573, 9906.66] + - [577, 9906.66] - - [1024, 4096, 1, 3265] - - [574, 9916.59] + - [578, 9916.59] - - [1024, 4096, 1, 3291] - - [573, 9902.63] + - [577, 9902.63] - - [1024, 4096, 1, 3390] - - [574, 9907.12] + - [578, 9907.12] - - [4096, 1024, 1, 3046] - - [573, 9847.58] + - [577, 9847.58] - - [1024, 4096, 1, 3539] - - [574, 9933.39] + - [578, 9933.39] - - [4096, 1024, 1, 3221] - - [574, 9860.64] + - [578, 9860.64] - - [4096, 1024, 1, 3433] - - [573, 9872.64] + - [577, 9872.64] - - [4096, 1024, 1, 3364] - - [574, 9881.81] + - [578, 9881.81] - - [4096, 1024, 1, 3470] - - [573, 9858.46] + - [577, 9858.46] - - [1024, 4096, 1, 3404] - - [566, 9907.17] + - [570, 9907.17] - - [1024, 33708, 1, 3968] - - [567, 10350.2] + - [571, 10350.2] - - [4096, 1024, 1, 3088] - - [573, 9868.96] + - [577, 9868.96] - - [1024, 4096, 1, 3247] - - [573, 9900.92] + - [577, 9900.92] - - [1024, 33708, 1, 3996] - - [566, 10328.4] + - [570, 10328.4] - - [4096, 1024, 1, 3482] - - [574, 9866.89] + - [578, 9866.89] - - [1024, 1024, 1, 3796] - - [569, 9031.58] + - [573, 9031.58] - - [4096, 1024, 1, 3995] - - [574, 9896.68] + - [578, 9896.68] - - [1024, 1024, 1, 3859] - - [571, 9097.26] + - [575, 9097.26] - - [1024, 4096, 1, 3280] - - [567, 9933.95] + - [571, 9933.95] - - [4096, 1024, 1, 3271] - - [574, 9859.99] + - [578, 9859.99] - - [64, 10, 5952, 10] - - [554, 1220.92] + - [558, 1220.92] - - [4096, 1024, 1, 3545] - - [573, 9877.25] + - [577, 9877.25] - - [4096, 1024, 1, 3476] - - [566, 9882.47] + - [570, 9882.47] - - [4096, 1024, 1, 3496] - - [567, 9880.4] + - [571, 9880.4] - - [4096, 1024, 1, 3191] - - [567, 9858.6] + - [571, 9858.6] - - [4096, 1024, 1, 3311] - - [574, 9853.1] + - [578, 9853.1] - - [1024, 4096, 1, 3302] - - [574, 9919.22] + - [578, 9919.22] - - [1024, 4096, 1, 3681] - - [573, 9944.89] + - [577, 9944.89] - - [4096, 1024, 1, 3582] - - [566, 9869.67] + - [570, 9869.67] - - [4096, 1024, 1, 3421] - - [574, 9855.98] + - [578, 9855.98] - - [4096, 1024, 1, 3560] - - [567, 9884.38] + - [571, 9884.38] - - [1024, 4096, 1, 3495] - - [574, 9930.03] + - [578, 9930.03] - - [4096, 1024, 1, 3186] - - [573, 9870.49] + - [577, 9870.49] - - [4096, 1024, 1, 3925] - - [573, 9903.9] + - [577, 9903.9] - - [64, 71, 896, 71] - - [558, 5004.69] + - [562, 5004.69] - - [1024, 4096, 1, 3435] - - [574, 9916.48] + - [578, 9916.48] - - [4096, 1024, 1, 3434] - - [573, 9871.19] + - [577, 9871.19] - - [1024, 33708, 1, 4012] - - [566, 10332.4] + - [570, 10332.4] - - [1024, 4096, 1, 3340] - - [566, 9918.01] + - [570, 9918.01] - - [1024, 1024, 1, 3860] - - [564, 8999.26] + - [568, 8999.26] - - [4096, 1024, 1, 3489] - - [573, 9881.92] + - [577, 9881.92] - - [1024, 4096, 1, 3162] - - [574, 9906.18] + - [578, 9906.18] - - [4096, 1024, 1, 3436] - - [573, 9858.02] + - [577, 9858.02] - - [1024, 1024, 1, 4005] - - [570, 9042.96] + - [574, 9042.96] - - [64, 84, 752, 84] - - [543, 5629.83] + - [547, 5629.83] - - [4096, 1024, 1, 3574] - - [573, 9886.6] + - [577, 9886.6] - - [4096, 1024, 1, 3469] - - [566, 9856.16] + - [570, 9856.16] - - [1024, 4096, 1, 3410] - - [567, 9924.64] + - [571, 9924.64] - - [1024, 4096, 1, 3216] - - [566, 9930.57] + - [570, 9930.57] - - [4096, 1024, 1, 3095] - - [573, 9846.91] + - [577, 9846.91] - - [1024, 1024, 1, 3990] - - [572, 9088.94] + - [576, 9088.94] - - [4096, 1024, 1, 3448] - - [573, 9863.84] + - [577, 9863.84] - - [1024, 4096, 1, 3176] - - [574, 9913.91] + - [578, 9913.91] - - [64, 49, 1296, 49] - - [539, 4437.36] + - [543, 4437.36] - - [4096, 1024, 1, 2918] - - [573, 9830.83] + - [577, 9830.83] - - [64, 14, 4368, 14] - - [553, 1802.37] + - [557, 1802.37] - - [1024, 4096, 1, 3424] - - [573, 9933.95] + - [577, 9933.95] - - [4096, 1024, 1, 3402] - - [566, 9863.02] + - [570, 9863.02] - - [4096, 1024, 1, 3145] - - [567, 9856.46] + - [571, 9856.46] - - [64, 134, 480, 134] - - [559, 6183.95] + - [563, 6183.95] - - [1024, 33708, 1, 3976] - - [567, 10330.0] + - [571, 10330.0] - - [4096, 1024, 1, 3518] - - [566, 9855.97] + - [570, 9855.97] - - [4096, 1024, 1, 3110] - - [573, 9856.36] + - [577, 9856.36] - - [4096, 1024, 1, 3325] - - [573, 9852.26] + - [577, 9852.26] - - [1024, 33708, 1, 3999] - - [566, 10329.6] + - [570, 10329.6] - - [4096, 1024, 1, 2985] - - [573, 9837.2] + - [577, 9837.2] - - [1024, 4096, 1, 3371] - - [566, 9912.93] + - [570, 9912.93] - - [4096, 1024, 1, 3342] - - [573, 9863.06] + - [577, 9863.06] - - [4096, 1024, 1, 3141] - - [567, 9849.81] + - [571, 9849.81] - - [4096, 1024, 1, 3532] - - [567, 9866.2] + - [571, 9866.2] - - [64, 78, 816, 78] - - [544, 5316.78] + - [548, 5316.78] - - [1024, 4096, 1, 3169] - - [574, 9910.35] + - [578, 9910.35] - - [1024, 4096, 1, 3514] - - [573, 9917.9] + - [577, 9917.9] - - [4096, 1024, 1, 3780] - - [574, 9899.65] + - [578, 9899.65] - - [1024, 4096, 1, 3098] - - [566, 9901.52] + - [570, 9901.52] - - [1024, 4096, 1, 3449] - - [574, 9919.75] + - [578, 9919.75] - - [1024, 4096, 1, 3222] - - [566, 9917.56] + - [570, 9917.56] - - [1024, 4096, 1, 3346] - - [567, 9912.81] + - [571, 9912.81] - - [4096, 1024, 1, 3064] - - [574, 9848.69] + - [578, 9848.69] - - [4096, 1024, 1, 3511] - - [573, 9873.29] + - [577, 9873.29] - - [4096, 1024, 1, 3384] - - [573, 9870.88] + - [577, 9870.88] - - [4096, 1024, 1, 3356] - - [567, 9853.35] + - [571, 9853.35] - - [1024, 4096, 1, 3796] - - [566, 9940.56] + - [570, 9940.56] - - [4096, 1024, 1, 3427] - - [573, 9883.04] + - [577, 9883.04] - - [4096, 1024, 1, 3390] - - [573, 9863.69] + - [577, 9863.69] - - [4096, 1024, 1, 3573] - - [574, 9885.92] + - [578, 9885.92] - - [4096, 1024, 1, 3456] - - [567, 9890.51] + - [571, 9890.51] - - [1024, 4096, 1, 3360] - - [574, 9938.0] + - [578, 9938.0] - - [1024, 33708, 1, 3977] - - [567, 10327.1] + - [571, 10327.1] - - [1024, 4096, 1, 2918] - - [566, 9902.74] + - [570, 9902.74] - - [4096, 1024, 1, 3975] - - [573, 9905.17] + - [577, 9905.17] - - [4096, 1024, 1, 3525] - - [574, 9879.81] + - [578, 9879.81] - - [4096, 1024, 1, 3398] - - [566, 9873.81] + - [570, 9873.81] - - [4096, 1024, 1, 3640] - - [573, 9885.06] + - [577, 9885.06] - - [1024, 1024, 1, 3999] - - [565, 8995.32] + - [569, 8995.32] - - [4096, 1024, 1, 3014] - - [573, 9841.22] + - [577, 9841.22] - - [1024, 4096, 1, 3446] - - [566, 9917.11] + - [570, 9917.11] - - [1024, 33708, 1, 3796] - - [566, 10338.9] + - [570, 10338.9] - - [4096, 1024, 1, 3101] - - [566, 9827.24] + - [570, 9827.24] - - [4096, 1024, 1, 3563] - - [574, 9862.93] + - [578, 9862.93] - - [4096, 1024, 1, 3539] - - [566, 9889.44] + - [570, 9889.44] - - [4096, 1024, 1, 3182] - - [573, 9833.69] + - [577, 9833.69] - - [1024, 4096, 1, 3468] - - [567, 9912.95] + - [571, 9912.95] - - [4096, 1024, 1, 3312] - - [573, 9889.75] + - [577, 9889.75] - - [4096, 1024, 1, 3215] - - [573, 9853.78] + - [577, 9853.78] - - [4096, 1024, 1, 3910] - - [573, 9894.62] + - [577, 9894.62] - - [1024, 33708, 1, 3780] - - [567, 10331.9] + - [571, 10331.9] - - [1024, 4096, 1, 3290] - - [573, 9914.98] + - [577, 9914.98] - - [1024, 4096, 1, 4012] - - [573, 9942.55] + - [577, 9942.55] - - [1024, 4096, 1, 3385] - - [573, 9915.73] + - [577, 9915.73] - - [1024, 33708, 1, 3975] - - [566, 10330.0] + - [570, 10330.0] - - [4096, 1024, 1, 3996] - - [573, 9891.21] + - [577, 9891.21] - - [4096, 1024, 1, 2765] - - [574, 9800.28] + - [578, 9800.28] - - [4096, 1024, 1, 3538] - - [574, 9886.12] + - [578, 9886.12] - - [4096, 1024, 1, 3415] - - [574, 9874.5] + - [578, 9874.5] - - [1024, 4096, 1, 3554] - - [573, 9931.89] + - [577, 9931.89] - - [4096, 1024, 1, 3513] - - [567, 9874.15] + - [571, 9874.15] - - [1024, 4096, 1, 3304] - - [567, 9907.63] + - [571, 9907.63] - - [4096, 1024, 1, 3294] - - [573, 9851.15] + - [577, 9851.15] - - [4096, 1024, 1, 3396] - - [574, 9880.6] + - [578, 9880.6] - - [1024, 4096, 1, 3213] - - [567, 9891.02] + - [571, 9891.02] - - [4096, 1024, 1, 3137] - - [567, 9857.31] + - [571, 9857.31] - - [4096, 1024, 1, 3552] - - [573, 9904.12] + - [577, 9904.12] - - [1024, 1024, 1, 4020] - - [572, 9098.77] + - [576, 9098.77] - - [64, 13, 4672, 13] - - [554, 1693.44] + - [558, 1693.44] - - [1024, 4096, 1, 3461] - - [573, 9918.35] + - [577, 9918.35] - - [4096, 1024, 1, 3263] - - [566, 9843.79] + - [570, 9843.79] - - [4096, 1024, 1, 3430] - - [573, 9885.16] + - [577, 9885.16] - - [4096, 1024, 1, 3389] - - [573, 9859.13] + - [577, 9859.13] - - [4096, 1024, 1, 3528] - - [573, 9872.91] + - [577, 9872.91] - - [1024, 4096, 1, 3463] - - [574, 9929.51] + - [578, 9929.51] - - [4096, 1024, 1, 3526] - - [574, 9876.8] + - [578, 9876.8] - - [4096, 1024, 1, 3154] - - [573, 9858.15] + - [577, 9858.15] - - [4096, 1024, 1, 3499] - - [574, 9862.82] + - [578, 9862.82] - - [1024, 1024, 1, 3939] - - [572, 9107.31] + - [576, 9107.31] - - [4096, 1024, 1, 3955] - - [574, 9906.18] + - [578, 9906.18] - - [1024, 4096, 1, 3297] - - [567, 9925.24] + - [571, 9925.24] - - [1024, 4096, 1, 3233] - - [573, 9920.55] + - [577, 9920.55] - - [1024, 4096, 1, 3226] - - [573, 9911.25] + - [577, 9911.25] - - [4096, 1024, 1, 3404] - - [573, 9867.18] + - [577, 9867.18] - - [4096, 1024, 1, 3355] - - [573, 9862.56] + - [577, 9862.56] - - [1024, 4096, 1, 3542] - - [573, 9926.39] + - [577, 9926.39] - - [4096, 1024, 1, 3181] - - [574, 9831.76] + - [578, 9831.76] - - [1024, 4096, 1, 3474] - - [573, 9927.93] + - [577, 9927.93] - - [4096, 1024, 1, 3319] - - [573, 9870.18] + - [577, 9870.18] - - [1024, 4096, 1, 3434] - - [566, 9917.41] + - [570, 9917.41] - - [1024, 4096, 1, 3860] - - [573, 9945.22] + - [577, 9945.22] - - [1024, 4096, 1, 3343] - - [566, 9914.56] + - [570, 9914.56] - - [64, 77, 816, 78] - - [544, 5276.87] + - [548, 5276.87] - - [1024, 4096, 1, 3488] - - [573, 9945.71] + - [577, 9945.71] - - [1024, 4096, 1, 3046] - - [573, 9908.68] + - [577, 9908.68] - - [1024, 4096, 1, 3141] - - [574, 9909.08] + - [578, 9909.08] - - [1024, 4096, 1, 3516] - - [574, 9911.28] + - [578, 9911.28] - - [4096, 1024, 1, 3147] - - [573, 9840.37] + - [577, 9840.37] - - [1024, 1024, 1, 4059] - - [565, 9009.68] + - [569, 9009.68] - - [1024, 1024, 1, 3944] - - [565, 9006.07] + - [569, 9006.07] - - [1024, 4096, 1, 3421] - - [574, 9919.76] + - [578, 9919.76] - - [4096, 1024, 1, 3944] - - [567, 9899.43] + - [571, 9899.43] - - [64, 45, 1424, 45] - - [552, 4068.57] + - [556, 4068.57] - - [1024, 4096, 1, 3574] - - [567, 9930.09] + - [571, 9930.09] - - [1024, 4096, 1, 3977] - - [566, 9944.18] + - [570, 9944.18] - - [1024, 1024, 1, 3968] - - [571, 9045.12] + - [575, 9045.12] - - [1024, 4096, 1, 2985] - - [573, 9887.55] + - [577, 9887.55] - - [64, 193, 320, 193] - - [560, 6631.25] + - [564, 6631.25] - - [1024, 4096, 1, 3427] - - [574, 9933.31] + - [578, 9933.31] - - [64, 12, 5040, 12] - - [554, 1552.43] + - [558, 1552.43] - - [1024, 4096, 1, 3482] - - [574, 9942.12] + - [578, 9942.12] - - [1024, 4096, 1, 3332] - - [566, 9923.48] + - [570, 9923.48] - - [1024, 1024, 1, 3720] - - [570, 9039.46] + - [574, 9039.46] - - [4096, 1024, 1, 3308] - - [574, 9852.56] + - [578, 9852.56] - - [1024, 4096, 1, 3513] - - [574, 9919.89] + - [578, 9919.89] - - [1024, 4096, 1, 3154] - - [567, 9908.36] + - [571, 9908.36] - - [1024, 4096, 1, 3955] - - [574, 9949.91] + - [578, 9949.91] - - [1024, 4096, 1, 2967] - - [574, 9897.34] + - [578, 9897.34] - - [1024, 33708, 1, 3942] - - [566, 10336.0] + - [570, 10336.0] - - [1024, 4096, 1, 3319] - - [574, 9912.35] + - [578, 9912.35] - - [4096, 1024, 1, 3860] - - [573, 9909.19] + - [577, 9909.19] - - [1024, 4096, 1, 3548] - - [566, 9924.11] + - [570, 9924.11] - - [4096, 1024, 1, 3977] - - [574, 9891.34] + - [578, 9891.34] - - [4096, 1024, 1, 3535] - - [573, 9867.74] + - [577, 9867.74] - - [1024, 4096, 1, 3541] - - [574, 9923.06] + - [578, 9923.06] - - [1024, 1024, 1, 3910] - - [571, 9080.3] + - [575, 9080.3] - - [1024, 33708, 1, 3584] - - [566, 10332.9] + - [570, 10332.9] - - [1024, 4096, 1, 3168] - - [567, 9926.17] + - [571, 9926.17] - - [1024, 4096, 1, 3448] - - [574, 9922.32] + - [578, 9922.32] - - [4096, 1024, 1, 3343] - - [573, 9857.13] + - [577, 9857.13] - - [64, 35, 1808, 35] - - [556, 3175.34] + - [560, 3175.34] - - [1024, 4096, 1, 3357] - - [567, 9902.31] + - [571, 9902.31] - - [64, 143, 432, 143] - - [557, 6489.6] + - [561, 6489.6] - - [4096, 1024, 1, 3510] - - [573, 9867.3] + - [577, 9867.3] - - [4096, 1024, 1, 3369] - - [573, 9863.34] + - [577, 9863.34] - - [64, 92, 688, 93] - - [544, 6188.2] + - [548, 6188.2] - - [4096, 1024, 1, 3379] - - [573, 9870.02] + - [577, 9870.02] - - [1024, 4096, 1, 3276] - - [573, 9904.67] + - [577, 9904.67] - - [1024, 4096, 1, 3363] - - [573, 9925.03] + - [577, 9925.03] - - [4096, 1024, 1, 3055] - - [573, 9831.82] + - [577, 9831.82] - - [1024, 4096, 1, 3524] - - [566, 9923.69] + - [570, 9923.69] - - [4096, 1024, 1, 3057] - - [573, 9852.77] + - [577, 9852.77] - - [1024, 33708, 1, 3720] - - [567, 10327.0] + - [571, 10327.0] - - [1024, 4096, 1, 3383] - - [566, 9919.29] + - [570, 9919.29] - - [1024, 4096, 1, 3522] - - [567, 9932.46] + - [571, 9932.46] - - [1024, 33708, 1, 3956] - - [566, 10333.7] + - [570, 10333.7] - - [1024, 4096, 1, 3481] - - [566, 9921.98] + - [570, 9921.98] - - [4096, 1024, 1, 3562] - - [574, 9874.76] + - [578, 9874.76] - - [4096, 1024, 1, 3299] - - [573, 9872.87] + - [577, 9872.87] - - [1024, 4096, 1, 3262] - - [567, 9924.73] + - [571, 9924.73] - - [1024, 4096, 1, 3840] - - [566, 9961.74] + - [570, 9961.74] - - [1024, 33708, 1, 4026] - - [566, 10334.2] + - [570, 10334.2] - - [4096, 1024, 1, 3168] - - [567, 9878.35] + - [571, 9878.35] - - [64, 101, 624, 101] - - [547, 5734.62] + - [551, 5734.62] - - [1024, 4096, 1, 3999] - - [566, 9947.0] + - [570, 9947.0] - - [1024, 4096, 1, 3549] - - [566, 9923.2] + - [570, 9923.2] - - [4096, 1024, 1, 3375] - - [573, 9868.79] + - [577, 9868.79] - - [1024, 4096, 1, 3496] - - [574, 9928.57] + - [578, 9928.57] - - [64, 29, 2176, 29] - - [543, 3289.92] + - [547, 3289.92] - - [1024, 4096, 1, 3190] - - [574, 9897.51] + - [578, 9897.51] - - [4096, 1024, 1, 3273] - - [574, 9853.55] + - [578, 9853.55] - - [1024, 4096, 1, 3406] - - [573, 9906.94] + - [577, 9906.94] - - [4096, 1024, 1, 4005] - - [566, 9907.87] + - [570, 9907.87] - - [4096, 1024, 1, 3555] - - [573, 9878.86] + - [577, 9878.86] - - [4096, 1024, 1, 2505] - - [573, 9785.0] + - [577, 9785.0] - - [1024, 4096, 1, 3460] - - [573, 9930.14] + - [577, 9930.14] - - [64, 17, 3632, 17] - - [544, 1917.17] + - [548, 1917.17] - - [1024, 4096, 1, 3579] - - [567, 9920.84] + - [571, 9920.84] - - [1024, 33708, 1, 4030] - - [567, 10327.6] + - [571, 10327.6] - - [1024, 4096, 1, 3510] - - [567, 9931.21] + - [571, 9931.21] - - [1024, 1024, 1, 3969] - - [564, 9020.73] + - [568, 9020.73] - - [1024, 4096, 1, 3282] - - [574, 9919.95] + - [578, 9919.95] - - [1024, 4096, 1, 3377] - - [566, 9927.24] + - [570, 9927.24] - - [1024, 4096, 1, 2935] - - [574, 9903.38] + - [578, 9903.38] - - [64, 41, 1552, 41] - - [544, 3740.38] + - [548, 3740.38] - - [1024, 4096, 1, 3498] - - [566, 9914.91] + - [570, 9914.91] - - [1024, 4096, 1, 3593] - - [573, 9925.54] + - [577, 9925.54] - - [1024, 1024, 1, 3948] - - [572, 9008.93] + - [576, 9008.93] - - [4096, 1024, 1, 3226] - - [574, 9854.65] + - [578, 9854.65] - - [1024, 4096, 1, 2499] - - [573, 9904.72] + - [577, 9904.72] - - [1024, 4096, 1, 3296] - - [566, 9926.79] + - [570, 9926.79] - - [1024, 4096, 1, 3455] - - [573, 9917.42] + - [577, 9917.42] - - [1024, 4096, 1, 3399] - - [567, 9919.6] + - [571, 9919.6] - - [1024, 4096, 1, 3205] - - [566, 9917.64] + - [570, 9917.64] - - [4096, 1024, 1, 4026] - - [574, 9897.71] + - [578, 9897.71] - - [1024, 4096, 1, 3484] - - [566, 9915.43] + - [570, 9915.43] - - [4096, 1024, 1, 3302] - - [574, 9862.7] + - [578, 9862.7] - - [1024, 4096, 1, 3485] - - [574, 9912.9] + - [578, 9912.9] - - [1024, 1024, 1, 3996] - - [572, 9008.67] + - [576, 9008.67] - - [1024, 4096, 1, 3126] - - [567, 9910.06] + - [571, 9910.06] - - [1024, 4096, 1, 4050] - - [566, 9951.11] + - [570, 9951.11] - - [4096, 1024, 1, 3235] - - [567, 9870.64] + - [571, 9870.64] - - [1024, 33708, 1, 3955] - - [566, 10336.0] + - [570, 10336.0] - - [1024, 4096, 1, 3342] - - [566, 9903.75] + - [570, 9903.75] - - [1024, 1024, 1, 3900] - - [571, 9082.82] + - [575, 9082.82] - - [1024, 4096, 1, 3397] - - [574, 9922.6] + - [578, 9922.6] - - [4096, 1024, 1, 3491] - - [574, 9880.65] + - [578, 9880.65] - - [1024, 4096, 1, 3503] - - [566, 9923.18] + - [570, 9923.18] - - [1024, 4096, 1, 3140] - - [567, 9908.31] + - [571, 9908.31] - - [4096, 1024, 1, 3121] - - [573, 9860.22] + - [577, 9860.22] - - [4096, 1024, 1, 3276] - - [573, 9854.09] + - [577, 9854.09] - - [1024, 4096, 1, 3321] - - [574, 9917.76] + - [578, 9917.76] - - [1024, 4096, 1, 3870] - - [574, 9930.97] + - [578, 9930.97] - - [4096, 1024, 1, 3475] - - [573, 9877.48] + - [577, 9877.48] - - [1024, 4096, 1, 2984] - - [573, 9895.49] + - [577, 9895.49] - - [4096, 1024, 1, 3363] - - [567, 9873.34] + - [571, 9873.34] - - [1024, 4096, 1, 3582] - - [573, 9920.77] + - [577, 9920.77] - - [4096, 1024, 1, 3509] - - [573, 9886.76] + - [577, 9886.76] - - [1024, 4096, 1, 3426] - - [566, 9928.76] + - [570, 9928.76] - - [4096, 1024, 1, 3136] - - [573, 9872.51] + - [577, 9872.51] - - [1024, 4096, 1, 3232] - - [574, 9926.19] + - [578, 9926.19] - - [4096, 1024, 1, 3103] - - [573, 9838.93] + - [577, 9838.93] - - [1024, 4096, 1, 3335] - - [567, 9913.27] + - [571, 9913.27] - - [1024, 4096, 1, 3900] - - [566, 9937.91] + - [570, 9937.91] - - [4096, 1024, 1, 3512] - - [567, 9877.16] + - [571, 9877.16] - - [4096, 1024, 1, 3222] - - [573, 9859.67] + - [577, 9859.67] - - [1024, 4096, 1, 3165] - - [573, 9899.61] + - [577, 9899.61] - - [4096, 1024, 1, 3408] - - [573, 9899.58] + - [577, 9899.58] - - [4096, 1024, 1, 3751] - - [573, 9891.39] + - [577, 9891.39] - - [1024, 4096, 1, 3318] - - [566, 9913.32] + - [570, 9913.32] - - [4096, 1024, 1, 3442] - - [574, 9880.11] + - [578, 9880.11] - - [1024, 4096, 1, 3413] - - [573, 9921.8] + - [577, 9921.8] - - [4096, 1024, 1, 3524] - - [573, 9879.12] + - [577, 9879.12] - - [1024, 4096, 1, 3976] - - [574, 9945.47] + - [578, 9945.47] - - [1024, 4096, 1, 3475] - - [574, 9932.41] + - [578, 9932.41] - - [1024, 4096, 1, 3534] - - [566, 9911.39] + - [570, 9911.39] - - [4096, 1024, 1, 3301] - - [573, 9872.65] + - [577, 9872.65] - - [4096, 1024, 1, 3248] - - [573, 9878.12] + - [577, 9878.12] - - [1024, 4096, 1, 2977] - - [567, 9899.83] + - [571, 9899.83] - - [4096, 1024, 1, 3346] - - [573, 9875.97] + - [577, 9875.97] - - [1024, 4096, 1, 3451] - - [566, 9920.06] + - [570, 9920.06] - - [1024, 4096, 1, 3257] - - [567, 9904.92] + - [571, 9904.92] - - [1024, 1024, 1, 3640] - - [565, 8983.29] + - [569, 8983.29] - - [1024, 4096, 1, 3356] - - [566, 9904.38] + - [570, 9904.38] - - [4096, 1024, 1, 3348] - - [574, 9872.43] + - [578, 9872.43] - - [4096, 1024, 1, 3335] - - [573, 9865.72] + - [577, 9865.72] - - [4096, 1024, 1, 3505] - - [573, 9888.78] + - [577, 9888.78] - - [1024, 4096, 1, 3490] - - [566, 9937.9] + - [570, 9937.9] - - [4096, 1024, 1, 3447] - - [573, 9865.29] + - [577, 9865.29] - - [1024, 4096, 1, 3267] - - [574, 9919.22] + - [578, 9919.22] - - [4096, 1024, 1, 3230] - - [573, 9853.1] + - [577, 9853.1] - - [4096, 1024, 1, 3455] - - [573, 9862.34] + - [577, 9862.34] - - [1024, 4096, 1, 3925] - - [566, 9945.54] + - [570, 9945.54] - - [1024, 4096, 1, 3362] - - [567, 9921.53] + - [571, 9921.53] - - [4096, 1024, 1, 3969] - - [574, 9911.88] + - [578, 9911.88] - - [4096, 1024, 1, 3527] - - [573, 9882.77] + - [577, 9882.77] - - [1024, 4096, 1, 3585] - - [567, 9946.42] + - [571, 9946.42] - - [4096, 1024, 1, 3063] - - [573, 9853.93] + - [577, 9853.93] - - [4096, 1024, 1, 3435] - - [573, 9867.03] + - [577, 9867.03] - - [4096, 1024, 1, 3366] - - [574, 9863.92] + - [578, 9863.92] - - [4096, 1024, 1, 3581] - - [566, 9868.47] + - [570, 9868.47] - - [1024, 33708, 1, 3906] - - [566, 10339.2] + - [570, 10339.2] - - [1024, 4096, 1, 3464] - - [574, 9916.11] + - [578, 9916.11] - - [1024, 4096, 1, 3440] - - [573, 9945.15] + - [577, 9945.15] - - [4096, 1024, 1, 3143] - - [573, 9846.66] + - [577, 9846.66] - - [1024, 4096, 1, 3349] - - [567, 9912.73] + - [571, 9912.73] - - [4096, 1024, 1, 3416] - - [573, 9885.03] + - [577, 9885.03] - - [4096, 1024, 1, 3365] - - [573, 9875.9] + - [577, 9875.9] - - [1024, 4096, 1, 3470] - - [574, 9914.88] + - [578, 9914.88] - - [4096, 1024, 1, 3287] - - [573, 9860.59] + - [577, 9860.59] - - [1024, 4096, 1, 3441] - - [574, 9928.88] + - [578, 9928.88] - - [4096, 1024, 1, 3224] - - [573, 9857.73] + - [577, 9857.73] - - [1024, 4096, 1, 3387] - - [566, 9911.62] + - [570, 9911.62] - - [1024, 4096, 1, 3547] - - [566, 9920.26] + - [570, 9920.26] - - [4096, 1024, 1, 3478] - - [567, 9882.8] + - [571, 9882.8] - - [4096, 1024, 1, 3548] - - [574, 9869.35] + - [578, 9869.35] - - [1024, 33708, 1, 4020] - - [566, 10345.2] + - [570, 10345.2] - - [4096, 1024, 1, 3320] - - [573, 9863.64] + - [577, 9863.64] - - [1024, 4096, 1, 3906] - - [573, 9942.57] + - [577, 9942.57] - - [4096, 1024, 1, 3796] - - [573, 9899.03] + - [577, 9899.03] - - [1024, 4096, 1, 3306] - - [566, 9902.3] + - [570, 9902.3] - - [1024, 4096, 1, 3401] - - [574, 9913.85] + - [578, 9913.85] - - [64, 147, 432, 147] - - [557, 6626.5] + - [561, 6626.5] - - [1024, 4096, 1, 3215] - - [574, 9911.14] + - [578, 9911.14] - - [4096, 1024, 1, 4012] - - [574, 9898.1] + - [578, 9898.1] - - [1024, 4096, 1, 2765] - - [574, 9863.63] + - [578, 9863.63] - - [4096, 1024, 1, 3554] - - [567, 9883.42] + - [571, 9883.42] - - [4096, 1024, 1, 3423] - - [573, 9866.62] + - [577, 9866.62] - - [1024, 1024, 1, 3751] - - [571, 9006.26] + - [575, 9006.26] - - [1024, 4096, 1, 3562] - - [567, 9921.98] + - [571, 9921.98] - - [1024, 4096, 1, 3489] - - [566, 9936.68] + - [570, 9936.68] - - [4096, 1024, 1, 3358] - - [573, 9858.12] + - [577, 9858.12] - - [4096, 1024, 1, 3270] - - [574, 9850.74] + - [578, 9850.74] - - [1024, 4096, 1, 3293] - - [566, 9905.23] + - [570, 9905.23] - - [1024, 4096, 1, 3376] - - [566, 9934.88] + - [570, 9934.88] - - [4096, 1024, 1, 3245] - - [573, 9852.42] + - [577, 9852.42] - - [4096, 1024, 1, 3541] - - [573, 9887.12] + - [577, 9887.12] - - [4096, 1024, 1, 3443] - - [573, 9871.63] + - [577, 9871.63] - - [4096, 1024, 1, 3438] - - [574, 9863.76] + - [578, 9863.76] - - [4096, 1024, 1, 3244] - - [573, 9859.66] + - [577, 9859.66] - - [1024, 4096, 1, 3365] - - [573, 9922.0] + - [577, 9922.0] - - [1024, 4096, 1, 3299] - - [567, 9923.28] + - [571, 9923.28] - - [4096, 1024, 1, 3840] - - [573, 9914.65] + - [577, 9914.65] - - [1024, 4096, 1, 3471] - - [574, 9918.28] + - [578, 9918.28] - - [1024, 4096, 1, 3398] - - [566, 9918.89] + - [570, 9918.89] - - [4096, 1024, 1, 3162] - - [573, 9843.83] + - [577, 9843.83] - - [1024, 4096, 1, 4005] - - [567, 9947.77] + - [571, 9947.77] - - [4096, 1024, 1, 3579] - - [573, 9868.15] + - [577, 9868.15] - - [64, 18, 3440, 18] - - [549, 2059.23] + - [553, 2059.23] - - [64, 177, 352, 177] - - [568, 7315.3] + - [572, 7315.3] - - [1024, 4096, 1, 3121] - - [574, 9930.24] + - [578, 9930.24] - - [4096, 1024, 1, 3441] - - [573, 9883.18] + - [577, 9883.18] - - [4096, 1024, 1, 3422] - - [573, 9858.31] + - [577, 9858.31] - - [4096, 1024, 1, 3444] - - [573, 9886.93] + - [577, 9886.93] - - [1024, 4096, 1, 3337] - - [567, 9911.35] + - [571, 9911.35] - - [4096, 1024, 1, 3550] - - [566, 9871.77] + - [570, 9871.77] - - [1024, 4096, 1, 3477] - - [566, 9930.55] + - [570, 9930.55] - - [4096, 1024, 1, 3490] - - [573, 9878.35] + - [577, 9878.35] - - [4096, 1024, 1, 3585] - - [573, 9893.53] + - [577, 9893.53] - - [1024, 4096, 1, 3143] - - [566, 9901.09] + - [570, 9901.09] - - [1024, 33708, 1, 3876] - - [567, 10330.7] + - [571, 10330.7] - - [1024, 4096, 1, 3320] - - [574, 9913.08] + - [578, 9913.08] - - [1024, 4096, 1, 3423] - - [574, 9914.04] + - [578, 9914.04] - - [1024, 4096, 1, 3894] - - [566, 9944.37] + - [570, 9944.37] - - [4096, 1024, 1, 3410] - - [573, 9878.57] + - [577, 9878.57] - - [1024, 4096, 1, 3561] - - [566, 9926.58] + - [570, 9926.58] - - [4096, 1024, 1, 3492] - - [567, 9872.82] + - [571, 9872.82] - - [64, 85, 752, 85] - - [544, 5734.25] + - [548, 5734.25] - - [36548, 1024, 1, 3712] - - [576, 10367.5] + - [580, 10367.5] - - [4096, 2048, 1, 128] - - [577, 8743.83] + - [581, 8743.83] - - [1024, 1024, 1, 3712] - - [578, 9976.19] + - [582, 9976.19] - - [1024, 1024, 1, 128] - - [575, 5765.37] + - [579, 5765.37] - - [4096, 3072, 1, 128] - - [577, 8869.01] + - [581, 8869.01] - - [768, 3072, 1, 4096] - - [589, 10028.7] + - [593, 10028.7] - - [64, 256, 192, 256] - - [583, 8791.55] + - [587, 8791.55] - - [768, 2, 1, 16] - - [586, 4.95484] + - [590, 4.95484] - - [768, 768, 1, 64] - - [582, 3469.55] + - [586, 3469.55] - - [768, 768, 1, 4096] - - [590, 7475.0] + - [594, 7475.0] - - [768, 30522, 1, 1280] - - [593, 10296.9] + - [597, 10296.9] - - [64, 128, 384, 128] - - [583, 7660.83] + - [587, 7660.83] - - [768, 30522, 1, 320] - - [591, 10007.9] + - [595, 10007.9] - - [768, 768, 1, 32] - - [580, 2359.3] + - [584, 2359.3] - - [3072, 768, 1, 4096] - - [589, 10033.7] + - [593, 10033.7] - - [768, 30522, 1, 640] - - [592, 10206.7] + - [596, 10206.7] - - [64, 64, 768, 64] - - [581, 5494.72] + - [585, 5494.72] - - [768, 768, 1, 640] - - [590, 6721.64] + - [594, 6721.64] - - [768, 768, 1, 16] - - [579, 1203.72] + - [583, 1203.72] - - [768, 768, 1, 1280] - - [588, 7138.57] + - [592, 7138.57] - - [768, 2, 1, 32] - - [584, 11.8154] + - [588, 11.8154] - - [2048, 2048, 1, 512] - - [604, 9607.57] + - [608, 9607.57] - - [512, 32, 1, 200] - - [597, 422.268] + - [601, 422.268] - - [1024, 1, 1, 200] - - [600, 24.6154] + - [604, 24.6154] - - [1600, 1024, 1, 512] - - [595, 8115.91] + - [599, 8115.91] - - [560, 1024, 1, 200] - - [594, 4810.74] + - [598, 4810.74] - - [1024, 1024, 1, 512] - - [603, 8614.74] + - [607, 8614.74] - - [2048, 1, 1, 512] - - [598, 80.9086] + - [602, 80.9086] - - [512, 512, 1, 200] - - [596, 4398.39] + - [600, 4398.39] - - [100, 2048, 1, 512] - - [601, 4443.12] + - [605, 4443.12] - - [1024, 1024, 1, 200] - - [602, 6990.51] + - [606, 6990.51] - - [1024, 64, 1, 512] - - [599, 2853.27] + - [603, 2853.27] - - [1024, 256, 1, 18944] - - [623, 9196.41] + - [627, 9196.41] - - [256, 3328, 1, 8976] - - [613, 8299.26] + - [617, 8299.26] - - [1024, 256, 1, 4352] - - [621, 8813.74] + - [625, 8813.74] - - [256, 9728, 1, 8976] - - [616, 9638.48] + - [620, 9638.48] - - [1024, 256, 1, 3072] - - [623, 8640.63] + - [627, 8640.63] - - [768, 2048, 1, 256] - - [615, 8662.93] + - [619, 8662.93] - - [1024, 256, 1, 19968] - - [620, 9220.86] + - [624, 9220.86] - - [256, 12800, 1, 8976] - - [610, 9418.42] + - [614, 9418.42] - - [1024, 256, 1, 3328] - - [624, 8682.48] + - [628, 8682.48] - - [256, 10240, 1, 8976] - - [617, 10137.7] + - [621, 10137.7] - - [1024, 256, 1, 15104] - - [622, 9167.03] + - [626, 9167.03] - - [256, 10496, 1, 8976] - - [610, 9858.38] + - [614, 9858.38] - - [1024, 256, 1, 2816] - - [625, 8575.71] + - [629, 8575.71] - - [1024, 256, 1, 4608] - - [620, 8861.21] + - [624, 8861.21] - - [256, 11264, 1, 8976] - - [607, 9627.69] + - [611, 9627.69] - - [1024, 256, 1, 6400] - - [620, 8985.23] + - [624, 8985.23] - - [1024, 256, 1, 16128] - - [620, 9170.26] + - [624, 9170.26] - - [256, 44505, 1, 8976] - - [614, 10331.8] + - [618, 10331.8] - - [256, 6144, 1, 8976] - - [617, 10395.0] + - [621, 10395.0] - - [1024, 256, 1, 5120] - - [622, 8881.53] + - [626, 8881.53] - - [1024, 256, 1, 7936] - - [625, 9023.14] + - [629, 9023.14] - - [256, 3840, 1, 8976] - - [612, 9541.28] + - [616, 9541.28] - - [1024, 256, 1, 21248] - - [620, 9209.72] + - [624, 9209.72] - - [1024, 256, 1, 12032] - - [622, 9156.17] + - [626, 9156.17] - - [256, 8192, 1, 8976] - - [619, 10374.4] + - [623, 10374.4] - - [1024, 256, 1, 3584] - - [621, 8712.2] + - [625, 8712.2] - - [1024, 256, 1, 14336] - - [622, 9162.51] + - [626, 9162.51] - - [256, 7168, 1, 8976] - - [608, 9554.86] + - [612, 9554.86] - - [1024, 256, 1, 13568] - - [620, 9165.04] + - [624, 9165.04] - - [256, 4096, 1, 8976] - - [612, 10146.6] + - [616, 10146.6] - - [1024, 256, 1, 4096] - - [621, 8783.88] + - [625, 8783.88] - - [256, 2560, 1, 8976] - - [611, 8381.56] + - [615, 8381.56] - - [256, 20992, 1, 8976] - - [610, 9989.86] + - [614, 9989.86] - - [256, 4352, 1, 8976] - - [611, 9634.92] + - [615, 9634.92] - - [256, 33536, 1, 8976] - - [610, 10218.1] + - [614, 10218.1] - - [256, 3584, 1, 8976] - - [612, 8924.5] + - [616, 8924.5] - - [256, 26112, 1, 8976] - - [611, 10272.3] + - [615, 10272.3] - - [256, 14336, 1, 8976] - - [615, 10217.3] + - [619, 10217.3] - - [1024, 256, 1, 14848] - - [622, 9185.19] + - [626, 9185.19] - - [1024, 256, 1, 8448] - - [623, 9025.89] + - [627, 9025.89] - - [1024, 256, 1, 28672] - - [620, 9256.4] + - [624, 9256.4] - - [1024, 256, 1, 5632] - - [620, 8932.69] + - [624, 8932.69] - - [256, 22016, 1, 8976] - - [615, 10151.9] + - [619, 10151.9] - - [1024, 256, 1, 33536] - - [620, 9243.07] + - [624, 9243.07] - - [256, 5120, 1, 8976] - - [606, 9418.05] + - [610, 9418.05] - - [256, 11520, 1, 8976] - - [613, 9701.0] + - [617, 9701.0] - - [256, 19968, 1, 8976] - - [611, 10228.0] + - [615, 10228.0] - - [1024, 256, 1, 5376] - - [622, 8892.52] + - [626, 8892.52] - - [1024, 256, 1, 22016] - - [620, 9244.24] + - [624, 9244.24] - - [256, 8960, 1, 8976] - - [611, 9841.31] + - [615, 9841.31] - - [1024, 256, 1, 15872] - - [620, 9223.15] + - [624, 9223.15] - - [256, 17408, 1, 8976] - - [615, 9785.77] + - [619, 9785.77] - - [256, 5632, 1, 8976] - - [615, 9564.22] + - [619, 9564.22] - - [256, 32512, 1, 8976] - - [614, 10357.9] + - [618, 10357.9] - - [256, 11008, 1, 8976] - - [607, 9445.13] + - [611, 9445.13] - - [1024, 256, 1, 6144] - - [622, 8955.81] + - [626, 8955.81] - - [256, 4864, 1, 8976] - - [607, 8979.35] + - [611, 8979.35] - - [256, 15104, 1, 8976] - - [610, 10007.0] + - [614, 10007.0] - - [1024, 256, 1, 9984] - - [620, 9110.43] + - [624, 9110.43] - - [256, 1280, 1, 8976] - - [606, 5944.34] + - [610, 5944.34] - - [1024, 256, 1, 1024] - - [622, 7005.1] + - [626, 7005.1] - - [1024, 256, 1, 9728] - - [622, 9066.19] + - [626, 9066.19] - - [1024, 256, 1, 10496] - - [620, 9118.05] + - [624, 9118.05] - - [256, 11776, 1, 8976] - - [617, 9911.64] + - [621, 9911.64] - - [256, 12544, 1, 8976] - - [610, 9235.25] + - [614, 9235.25] - - [1024, 256, 1, 17152] - - [620, 9152.21] + - [624, 9152.21] - - [1024, 256, 1, 11520] - - [622, 9146.77] + - [626, 9146.77] - - [1024, 256, 1, 21504] - - [622, 9207.42] + - [626, 9207.42] - - [256, 17152, 1, 8976] - - [609, 9654.71] + - [613, 9654.71] - - [1024, 256, 1, 17408] - - [620, 9181.17] + - [624, 9181.17] - - [256, 15872, 1, 8976] - - [618, 10086.4] + - [622, 10086.4] - - [256, 18688, 1, 8976] - - [611, 9612.47] + - [615, 9612.47] - - [256, 5888, 1, 8976] - - [615, 9988.33] + - [619, 9988.33] - - [512, 2048, 1, 256] - - [605, 7678.36] + - [609, 7678.36] - - [1024, 256, 1, 7680] - - [623, 9032.96] + - [627, 9032.96] - - [1024, 256, 1, 1280] - - [625, 7767.23] + - [629, 7767.23] - - [256, 14848, 1, 8976] - - [611, 9852.66] + - [615, 9852.66] - - [256, 9984, 1, 8976] - - [617, 9908.87] + - [621, 9908.87] - - [256, 20480, 1, 8976] - - [615, 10337.1] + - [619, 10337.1] - - [1024, 256, 1, 8192] - - [622, 9044.32] + - [626, 9044.32] - - [1024, 256, 1, 19712] - - [621, 9184.18] + - [625, 9184.18] - - [256, 13568, 1, 8976] - - [611, 9927.82] + - [615, 9927.82] - - [256, 13312, 1, 8976] - - [610, 9757.91] + - [614, 9757.91] - - [256, 2816, 1, 8976] - - [610, 9191.43] + - [614, 9191.43] - - [1024, 256, 1, 2304] - - [621, 8444.91] + - [625, 8444.91] - - [256, 21248, 1, 8976] - - [611, 10127.5] + - [615, 10127.5] - - [256, 16128, 1, 8976] - - [619, 10238.4] + - [623, 10238.4] - - [256, 512, 36, 98] - - [642, 7994.85] + - [646, 7994.85] - - [64, 192, 36, 25088] - - [711, 8613.89] + - [715, 8613.89] - - [128, 128, 64, 25] - - [641, 2540.15] + - [645, 2540.15] - - [256, 256, 64, 56] - - [642, 6924.56] + - [646, 6924.56] - - [512, 486, 36, 800] - - [649, 8994.84] + - [653, 8994.84] - - [512, 512, 36, 1568] - - [660, 9872.38] + - [664, 9872.38] - - [64, 192, 64, 3200] - - [705, 9295.89] + - [709, 9295.89] - - [256, 384, 36, 4096] - - [705, 9334.61] + - [709, 9334.61] - - [128, 256, 64, 32] - - [644, 4279.9] + - [648, 4279.9] - - [64, 128, 64, 23104] - - [711, 10103.1] + - [715, 10103.1] - - [128, 256, 64, 9] - - [635, 1709.63] + - [639, 1709.63] - - [256, 512, 36, 784] - - [645, 9520.73] + - [649, 9520.73] - - [256, 324, 36, 32] - - [683, 4473.38] + - [687, 4473.38] - - [512, 512, 36, 33] - - [654, 5925.17] + - [658, 5925.17] - - [16, 32, 36, 5760] - - [658, 1448.8] + - [662, 1448.8] - - [192, 384, 64, 128] - - [705, 8618.43] + - [709, 8618.43] - - [512, 512, 64, 72] - - [661, 8260.12] + - [665, 8260.12] - - [128, 128, 64, 1600] - - [634, 9008.38] + - [638, 9008.38] - - [512, 512, 36, 128] - - [705, 8871.62] + - [709, 8871.62] - - [192, 384, 64, 2304] - - [634, 9657.16] + - [638, 9657.16] - - [384, 256, 64, 450] - - [670, 9538.93] + - [674, 9538.93] - - [3, 64, 36, 6272] - - [658, 509.784] + - [662, 509.784] - - [3, 64, 64, 2888] - - [687, 708.621] + - [691, 708.621] - - [384, 256, 64, 2304] - - [670, 10287.5] + - [674, 10287.5] - - [512, 512, 64, 144] - - [705, 9226.7] + - [709, 9226.7] - - [256, 256, 36, 6272] - - [645, 9607.28] + - [649, 9607.28] - - [80, 192, 64, 4608] - - [706, 7347.93] + - [710, 7347.93] - - [64, 64, 36, 3136] - - [693, 5959.05] + - [697, 5959.05] - - [256, 384, 64, 2304] - - [670, 10283.4] + - [674, 10283.4] - - [512, 512, 36, 66] - - [654, 7618.08] + - [658, 7618.08] - - [128, 256, 64, 800] - - [680, 9611.15] + - [684, 9611.15] - - [64, 128, 36, 30] - - [636, 1242.61] + - [640, 1242.61] - - [192, 256, 36, 512] - - [705, 8657.97] + - [709, 8657.97] - - [256, 512, 64, 200] - - [705, 9153.87] + - [709, 9153.87] - - [256, 512, 64, 25] - - [683, 5349.88] + - [687, 5349.88] - - [3, 64, 64, 46208] - - [686, 808.562] + - [690, 808.562] - - [128, 256, 36, 1568] - - [678, 8528.62] + - [682, 8528.62] - - [64, 128, 64, 11552] - - [711, 9997.0] + - [715, 9997.0] - - [128, 192, 64, 946] - - [705, 9198.38] + - [709, 9198.38] - - [64, 192, 64, 12800] - - [666, 9000.66] + - [670, 9000.66] - - [224, 224, 64, 128] - - [643, 6312.07] + - [647, 6312.07] - - [128, 256, 64, 288] - - [705, 8697.87] + - [709, 8697.87] - - [64, 64, 64, 826] - - [648, 6650.21] + - [652, 6650.21] - - [256, 384, 64, 1152] - - [680, 10106.8] + - [684, 10106.8] - - [3, 64, 64, 92416] - - [686, 812.031] + - [690, 812.031] - - [32, 32, 36, 43808] - - [627, 2813.09] + - [631, 2813.09] - - [160, 320, 64, 288] - - [637, 8090.86] + - [641, 8090.86] - - [1, 16, 36, 23040] - - [674, 42.6667] + - [678, 42.6667] - - [128, 256, 36, 128] - - [652, 6049.48] + - [656, 6049.48] - - [128, 128, 64, 3360] - - [705, 9199.96] + - [709, 9199.96] - - [128, 128, 64, 420] - - [705, 8131.5] + - [709, 8131.5] - - [64, 128, 64, 361] - - [642, 6937.98] + - [646, 6937.98] - - [512, 512, 36, 16] - - [698, 3797.66] + - [702, 3797.66] - - [384, 256, 36, 800] - - [639, 9151.65] + - [643, 9151.65] - - [192, 384, 36, 4096] - - [639, 8867.57] + - [643, 8867.57] - - [64, 64, 64, 1600] - - [691, 7931.74] + - [695, 7931.74] - - [256, 384, 64, 576] - - [671, 9745.8] + - [675, 9745.8] - - [512, 512, 64, 14] - - [654, 3638.18] + - [658, 3638.18] - - [512, 512, 36, 8] - - [629, 2279.51] + - [633, 2279.51] - - [512, 486, 64, 128] - - [645, 8337.83] + - [649, 8337.83] - - [1, 16, 64, 640] - - [679, 49.9512] + - [683, 49.9512] - - [64, 96, 64, 288] - - [704, 5707.97] + - [708, 5707.97] - - [96, 96, 36, 1568] - - [673, 6866.75] + - [677, 6866.75] - - [256, 256, 36, 128] - - [677, 7703.82] + - [681, 7703.82] - - [64, 128, 36, 53824] - - [665, 6331.31] + - [669, 6331.31] - - [256, 256, 36, 32] - - [661, 4648.86] + - [665, 4648.86] - - [192, 256, 64, 288] - - [705, 8987.79] + - [709, 8987.79] - - [256, 256, 36, 16] - - [675, 2912.71] + - [679, 2912.71] - - [128, 256, 36, 3200] - - [678, 8680.27] + - [682, 8680.27] - - [160, 320, 64, 512] - - [637, 8449.44] + - [641, 8449.44] - - [128, 160, 36, 512] - - [648, 7214.97] + - [652, 7214.97] - - [96, 96, 36, 2592] - - [643, 7104.79] + - [647, 7104.79] - - [64, 96, 64, 800] - - [673, 7268.32] + - [677, 7268.32] - - [147, 64, 36, 18816] - - [689, 7116.26] + - [693, 7116.26] - - [160, 320, 36, 512] - - [643, 7874.82] + - [647, 7874.82] - - [256, 512, 36, 4] - - [682, 1034.78] + - [686, 1034.78] - - [96, 128, 64, 946] - - [665, 7901.07] + - [669, 7901.07] - - [256, 324, 64, 1568] - - [670, 8589.53] + - [674, 8589.53] - - [128, 128, 64, 50] - - [661, 4070.56] + - [665, 4070.56] - - [35, 96, 36, 8960] - - [655, 4207.3] + - [659, 4207.3] - - [32, 64, 36, 43808] - - [696, 4390.81] + - [700, 4390.81] - - [160, 224, 36, 128] - - [643, 5446.92] + - [647, 5446.92] - - [64, 64, 64, 81] - - [668, 2391.18] + - [672, 2391.18] - - [256, 256, 36, 3200] - - [634, 9559.55] + - [638, 9559.55] - - [256, 256, 36, 210] - - [645, 8414.61] + - [649, 8414.61] - - [192, 384, 64, 576] - - [705, 9468.75] + - [709, 9468.75] - - [512, 512, 64, 800] - - [680, 10096.4] + - [684, 10096.4] - - [512, 24, 36, 800] - - [631, 4761.77] + - [635, 4761.77] - - [64, 64, 64, 13216] - - [692, 8491.41] + - [696, 8491.41] - - [192, 224, 64, 1152] - - [648, 8769.06] + - [652, 8769.06] - - [256, 256, 64, 1152] - - [670, 9988.09] + - [674, 9988.09] - - [512, 486, 64, 512] - - [680, 9254.67] + - [684, 9254.67] - - [128, 128, 36, 784] - - [643, 7468.06] + - [647, 7468.06] - - [256, 512, 64, 1600] - - [667, 10232.5] + - [671, 10232.5] - - [512, 512, 64, 9] - - [661, 2599.78] + - [665, 2599.78] - - [96, 128, 64, 288] - - [673, 6599.43] + - [677, 6599.43] - - [64, 96, 36, 512] - - [673, 5073.75] + - [677, 5073.75] - - [256, 512, 36, 1568] - - [705, 9637.81] + - [709, 9637.81] - - [128, 128, 64, 400] - - [705, 8192.0] + - [709, 8192.0] - - [128, 128, 64, 800] - - [705, 8716.34] + - [709, 8716.34] - - [96, 128, 36, 512] - - [693, 6756.93] + - [697, 6756.93] - - [16, 32, 36, 360] - - [656, 754.036] + - [660, 754.036] - - [128, 256, 64, 3200] - - [670, 10222.5] + - [674, 10222.5] - - [96, 128, 64, 800] - - [673, 7967.9] + - [677, 7967.9] - - [256, 512, 64, 4] - - [635, 1097.99] + - [639, 1097.99] - - [256, 256, 64, 450] - - [680, 9347.45] + - [684, 9347.45] - - [64, 64, 64, 3200] - - [691, 8518.08] + - [695, 8518.08] - - [192, 224, 64, 128] - - [651, 7035.17] + - [655, 7035.17] - - [128, 128, 64, 288] - - [705, 7751.28] + - [709, 7751.28] - - [256, 256, 64, 72] - - [661, 7489.83] + - [665, 7489.83] - - [96, 208, 36, 512] - - [673, 6939.11] + - [677, 6939.11] - - [128, 256, 36, 3136] - - [648, 8669.33] + - [652, 8669.33] - - [64, 64, 36, 3520] - - [643, 6007.47] + - [647, 6007.47] - - [64, 128, 36, 1568] - - [706, 6897.7] + - [710, 6897.7] - - [160, 320, 64, 242] - - [632, 7873.17] + - [636, 7873.17] - - [192, 192, 36, 512] - - [643, 7707.32] + - [647, 7707.32] - - [512, 512, 36, 512] - - [705, 9582.42] + - [709, 9582.42] - - [1, 16, 64, 10240] - - [657, 71.3511] + - [661, 71.3511] - - [128, 128, 36, 512] - - [643, 7149.38] + - [647, 7149.38] - - [512, 512, 36, 256] - - [634, 9384.4] + - [638, 9384.4] - - [512, 512, 36, 1024] - - [628, 9777.89] + - [632, 9777.89] - - [96, 208, 64, 1152] - - [706, 7850.9] + - [710, 7850.9] - - [128, 192, 64, 3200] - - [634, 9490.82] + - [638, 9490.82] - - [256, 256, 36, 4096] - - [639, 9585.46] + - [643, 9585.46] - - [160, 160, 64, 288] - - [673, 7299.8] + - [677, 7299.8] - - [256, 256, 64, 896] - - [670, 9850.33] + - [674, 9850.33] - - [128, 256, 64, 242] - - [705, 8391.38] + - [709, 8391.38] - - [128, 128, 36, 440] - - [648, 6274.72] + - [652, 6274.72] - - [96, 128, 36, 1568] - - [693, 7875.03] + - [697, 7875.03] - - [192, 384, 36, 1024] - - [639, 8715.72] + - [643, 8715.72] - - [64, 96, 36, 10368] - - [710, 7478.59] + - [714, 7478.59] - - [128, 256, 64, 100] - - [654, 7084.97] + - [658, 7084.97] - - [112, 224, 36, 2048] - - [647, 7555.92] + - [651, 7555.92] - - [384, 256, 64, 1152] - - [670, 10102.3] + - [674, 10102.3] - - [192, 384, 36, 128] - - [705, 7543.04] + - [709, 7543.04] - - [128, 128, 36, 7040] - - [678, 7600.6] + - [682, 7600.6] - - [128, 256, 64, 1568] - - [670, 10005.9] + - [674, 10005.9] - - [128, 128, 36, 1568] - - [662, 7848.3] + - [666, 7848.3] - - [128, 256, 64, 72] - - [685, 6553.6] + - [689, 6553.6] - - [256, 256, 36, 12544] - - [699, 9365.04] + - [703, 9365.04] - - [256, 256, 36, 105] - - [661, 7286.06] + - [665, 7286.06] - - [128, 256, 36, 392] - - [648, 7625.69] + - [652, 7625.69] - - [64, 64, 64, 5408] - - [691, 8882.67] + - [695, 8882.67] - - [3, 64, 36, 25088] - - [658, 528.942] + - [662, 528.942] - - [384, 256, 36, 1024] - - [705, 9182.75] + - [709, 9182.75] - - [35, 96, 36, 13440] - - [712, 4110.29] + - [716, 4110.29] - - [128, 256, 64, 1152] - - [670, 9804.87] + - [674, 9804.87] - - [256, 324, 64, 32] - - [683, 5043.63] + - [687, 5043.63] - - [160, 224, 64, 128] - - [697, 6046.15] + - [701, 6046.15] - - [192, 224, 36, 2592] - - [695, 8878.68] + - [699, 8878.68] - - [96, 96, 64, 1152] - - [673, 8035.45] + - [677, 8035.45] - - [32, 64, 36, 90] - - [630, 964.465] + - [634, 964.465] - - [64, 128, 64, 2888] - - [645, 9047.23] + - [649, 9047.23] - - [256, 384, 36, 800] - - [705, 9154.02] + - [709, 9154.02] - - [512, 512, 64, 4] - - [702, 1233.62] + - [706, 1233.62] - - [192, 320, 36, 128] - - [642, 7388.19] + - [646, 7388.19] - - [64, 128, 36, 480] - - [706, 5653.27] + - [710, 5653.27] - - [192, 384, 64, 242] - - [705, 9079.99] + - [709, 9079.99] - - [256, 486, 64, 32] - - [698, 5909.18] + - [702, 5909.18] - - [147, 64, 64, 9702] - - [707, 7319.69] + - [711, 7319.69] - - [512, 512, 64, 64] - - [641, 8179.02] + - [645, 8179.02] - - [64, 192, 64, 3698] - - [634, 9287.89] + - [638, 9287.89] - - [73, 192, 64, 10439] - - [665, 6668.02] + - [669, 6668.02] - - [1, 16, 36, 1440] - - [681, 33.4452] + - [685, 33.4452] - - [128, 256, 36, 512] - - [648, 7989.15] + - [652, 7989.15] - - [512, 512, 64, 576] - - [680, 9951.89] + - [684, 9951.89] - - [64, 64, 36, 12544] - - [696, 5872.77] + - [700, 5872.77] - - [128, 128, 36, 880] - - [693, 7597.26] + - [697, 7597.26] - - [192, 224, 36, 128] - - [651, 6451.2] + - [655, 6451.2] - - [64, 64, 64, 800] - - [691, 6916.73] + - [695, 6916.73] - - [64, 128, 36, 12544] - - [669, 6395.88] + - [673, 6395.88] - - [64, 64, 36, 1568] - - [643, 5536.66] + - [647, 5536.66] - - [160, 160, 36, 512] - - [643, 7345.26] + - [647, 7345.26] - - [512, 24, 64, 512] - - [633, 5242.88] + - [637, 5242.88] - - [3, 64, 36, 3136] - - [658, 475.352] + - [662, 475.352] - - [256, 256, 64, 9] - - [683, 2106.51] + - [687, 2106.51] - - [3, 64, 64, 11552] - - [686, 785.127] + - [690, 785.127] - - [128, 256, 36, 12544] - - [701, 8792.13] + - [705, 8792.13] - - [128, 128, 36, 3136] - - [662, 8098.46] + - [666, 8098.46] - - [256, 512, 36, 3136] - - [645, 9694.39] + - [649, 9694.39] - - [64, 64, 36, 196] - - [659, 2757.76] + - [663, 2757.76] - - [144, 288, 36, 512] - - [693, 7077.89] + - [697, 7077.89] - - [256, 24, 64, 32] - - [672, 1483.83] + - [676, 1483.83] - - [384, 384, 36, 800] - - [634, 9246.5] + - [638, 9246.5] - - [512, 512, 64, 1600] - - [680, 10277.3] + - [684, 10277.3] - - [112, 224, 36, 512] - - [648, 6744.78] + - [652, 6744.78] - - [128, 128, 36, 49] - - [654, 2716.29] + - [658, 2716.29] - - [512, 512, 36, 4] - - [682, 1156.52] + - [686, 1156.52] - - [35, 96, 64, 4235] - - [643, 4631.28] + - [647, 4631.28] - - [192, 384, 64, 450] - - [634, 9372.2] + - [638, 9372.2] - - [256, 256, 36, 1024] - - [705, 9346.64] + - [709, 9346.64] - - [112, 224, 64, 1152] - - [648, 7523.95] + - [652, 7523.95] - - [256, 512, 64, 400] - - [667, 9597.95] + - [671, 9597.95] - - [149, 32, 36, 19072] - - [712, 5811.8] + - [716, 5811.8] - - [128, 256, 36, 6272] - - [648, 8754.68] + - [652, 8754.68] - - [128, 192, 36, 1568] - - [673, 8195.1] + - [677, 8195.1] - - [256, 256, 36, 512] - - [705, 9074.22] + - [709, 9074.22] - - [256, 256, 64, 112] - - [705, 8305.55] + - [709, 8305.55] - - [512, 512, 64, 18] - - [698, 4324.02] + - [702, 4324.02] - - [256, 256, 64, 18] - - [661, 3547.81] + - [665, 3547.81] - - [256, 256, 64, 1568] - - [670, 10141.7] + - [674, 10141.7] - - [64, 96, 36, 1568] - - [691, 6805.66] + - [695, 6805.66] - - [384, 256, 36, 4096] - - [705, 9311.1] + - [709, 9311.1] - - [256, 512, 64, 800] - - [680, 9998.35] + - [684, 9998.35] - - [256, 384, 36, 2048] - - [705, 9285.34] + - [709, 9285.34] - - [3, 64, 36, 200704] - - [687, 547.375] + - [691, 547.375] - - [384, 384, 64, 2304] - - [628, 9901.68] + - [632, 9901.68] - - [160, 320, 64, 128] - - [664, 7113.81] + - [668, 7113.81] - - [512, 512, 36, 528] - - [634, 9567.65] + - [638, 9567.65] - - [160, 320, 36, 128] - - [665, 6411.13] + - [669, 6411.13] - - [96, 96, 64, 800] - - [673, 7690.01] + - [677, 7690.01] - - [256, 512, 36, 49] - - [661, 6721.25] + - [665, 6721.25] - - [384, 384, 64, 450] - - [634, 9523.53] + - [638, 9523.53] - - [3, 64, 64, 23104] - - [686, 801.621] + - [690, 801.621] - - [256, 256, 64, 3200] - - [670, 10300.4] + - [674, 10300.4] - - [128, 192, 36, 512] - - [648, 7499.75] + - [652, 7499.75] - - [192, 192, 64, 288] - - [705, 8774.24] + - [709, 8774.24] - - [96, 208, 64, 242] - - [665, 5901.99] + - [669, 5901.99] - - [256, 16, 36, 3200] - - [694, 3807.77] + - [698, 3807.77] - - [512, 512, 64, 8] - - [672, 2379.75] + - [676, 2379.75] - - [64, 128, 64, 5776] - - [645, 9332.74] + - [649, 9332.74] - - [512, 512, 64, 288] - - [634, 9521.99] + - [638, 9521.99] - - [256, 16, 36, 32] - - [690, 766.005] + - [694, 766.005] - - [128, 192, 64, 288] - - [705, 8527.58] + - [709, 8527.58] - - [32, 64, 64, 640] - - [673, 4660.34] + - [677, 4660.34] - - [64, 64, 36, 392] - - [673, 3686.4] + - [677, 3686.4] - - [384, 384, 36, 1024] - - [639, 9282.48] + - [643, 9282.48] - - [64, 64, 36, 11552] - - [703, 5904.78] + - [707, 5904.78] - - [96, 128, 36, 6272] - - [693, 8350.99] + - [697, 8350.99] - - [128, 256, 36, 16] - - [675, 2144.81] + - [679, 2144.81] - - [256, 256, 64, 288] - - [705, 9140.13] + - [709, 9140.13] - - [64, 64, 64, 1652] - - [691, 7766.53] + - [695, 7766.53] - - [256, 384, 36, 1024] - - [639, 9203.27] + - [643, 9203.27] - - [96, 128, 64, 3200] - - [708, 8866.2] + - [712, 8866.2] - - [256, 324, 36, 3200] - - [647, 8194.25] + - [651, 8194.25] - - [128, 192, 64, 800] - - [705, 9198.03] + - [709, 9198.03] - - [64, 128, 64, 10] - - [646, 851.117] + - [650, 851.117] - - [96, 208, 64, 288] - - [673, 6667.58] + - [677, 6667.58] - - [64, 96, 36, 2592] - - [655, 7216.88] + - [659, 7216.88] - - [64, 128, 64, 160] - - [684, 5190.97] + - [688, 5190.97] - - [192, 384, 64, 512] - - [634, 9446.04] + - [638, 9446.04] - - [64, 64, 36, 6272] - - [643, 6212.01] + - [647, 6212.01] - - [512, 24, 36, 288] - - [640, 3922.47] + - [644, 3922.47] - - [128, 128, 64, 1568] - - [634, 9037.86] + - [638, 9037.86] - - [112, 224, 64, 242] - - [704, 6399.26] + - [708, 6399.26] - - [128, 256, 64, 1600] - - [670, 10010.3] + - [674, 10010.3] - - [32, 32, 64, 20000] - - [638, 4378.41] + - [642, 4378.41] - - [160, 192, 64, 288] - - [665, 7803.63] + - [669, 7803.63] - - [512, 24, 64, 128] - - [626, 3733.8] + - [630, 3733.8] - - [512, 512, 36, 32] - - [661, 5935.34] + - [665, 5935.34] - - [3, 64, 36, 100352] - - [658, 542.783] + - [662, 542.783] - - [3, 64, 64, 1444] - - [687, 674.159] + - [691, 674.159] - - [512, 512, 36, 3136] - - [628, 9921.1] + - [632, 9921.1] - - [128, 256, 64, 6400] - - [688, 10349.3] + - [692, 10349.3] - - [256, 256, 36, 2048] - - [705, 9518.99] + - [709, 9518.99] - - [128, 160, 64, 288] - - [648, 7549.75] + - [652, 7549.75] - - [256, 256, 64, 6400] - - [670, 10392.6] + - [674, 10392.6] - - [32, 64, 64, 20000] - - [696, 6493.86] + - [700, 6493.86] - - [256, 256, 36, 1680] - - [645, 9513.29] + - [649, 9513.29] - - [128, 128, 64, 210] - - [705, 7094.1] + - [709, 7094.1] - - [192, 384, 36, 2048] - - [634, 8818.65] + - [638, 8818.65] - - [256, 256, 64, 144] - - [705, 8608.61] + - [709, 8608.61] - - [384, 384, 36, 4096] - - [639, 9356.94] + - [643, 9356.94] - - [160, 320, 64, 1152] - - [665, 8749.48] + - [669, 8749.48] - - [384, 256, 36, 2048] - - [705, 9279.63] + - [709, 9279.63] - - [256, 512, 36, 392] - - [705, 9252.14] + - [709, 9252.14] - - [256, 512, 64, 50] - - [661, 7511.29] + - [665, 7511.29] - - [73, 192, 36, 23360] - - [709, 5802.93] + - [713, 5802.93] - - [3, 64, 36, 50176] - - [658, 542.037] + - [662, 542.037] - - [384, 384, 36, 2048] - - [634, 9325.8] + - [638, 9325.8] - - [256, 384, 64, 450] - - [680, 9528.66] + - [684, 9528.66] - - [192, 320, 64, 128] - - [639, 8399.81] + - [643, 8399.81] - - [128, 256, 36, 32] - - [654, 3276.8] + - [658, 3276.8] - - [160, 192, 36, 512] - - [693, 7752.34] + - [697, 7752.34] - - [512, 512, 64, 256] - - [645, 9473.64] + - [649, 9473.64] - - [256, 512, 64, 32] - - [683, 6391.32] + - [687, 6391.32] - - [384, 384, 64, 576] - - [634, 9614.79] + - [638, 9614.79] - - [64, 64, 64, 648] - - [691, 6282.15] + - [695, 6282.15] - - [512, 486, 36, 288] - - [705, 8624.93] + - [709, 8624.93] - - [32, 64, 36, 1440] - - [643, 3961.5] + - [647, 3961.5] - - [144, 288, 64, 242] - - [665, 6347.02] + - [669, 6347.02] - - [384, 256, 64, 576] - - [670, 9775.24] + - [674, 9775.24] - - [512, 512, 36, 64] - - [641, 7791.28] + - [645, 7791.28] - - [448, 384, 64, 128] - - [634, 9132.23] + - [638, 9132.23] - - [64, 128, 64, 722] - - [684, 8047.11] + - [688, 8047.11] - - [144, 288, 64, 288] - - [693, 6859.4] + - [697, 6859.4] - - [512, 512, 64, 224] - - [705, 9427.29] + - [709, 9427.29] - - [112, 224, 64, 288] - - [704, 6736.92] + - [708, 6736.92] - - [384, 384, 64, 1152] - - [628, 9820.46] + - [632, 9820.46] - - [448, 384, 36, 128] - - [705, 8761.31] + - [709, 8761.31] - - [64, 64, 64, 100] - - [651, 2708.1] + - [655, 2708.1] - - [256, 486, 36, 128] - - [677, 7640.04] + - [681, 7640.04] - - [64, 96, 64, 4608] - - [706, 8351.49] + - [710, 8351.49] - - [16, 32, 64, 160] - - [630, 736.36] + - [634, 736.36] - - [64, 192, 36, 6272] - - [706, 8041.19] + - [710, 8041.19] - - [64, 64, 64, 200] - - [659, 3924.31] + - [663, 3924.31] - - [256, 256, 36, 800] - - [705, 9299.55] + - [709, 9299.55] - - [64, 128, 36, 6272] - - [703, 6816.36] + - [707, 6816.36] - - [32, 64, 64, 40] - - [650, 885.622] + - [654, 885.622] - - [256, 16, 64, 32] - - [700, 1205.26] + - [704, 1205.26] - - [192, 384, 36, 800] - - [639, 8673.88] + - [643, 8673.88] - - [128, 128, 36, 3200] - - [673, 8538.89] + - [677, 8538.89] - - [256, 256, 36, 256] - - [645, 8454.36] + - [649, 8454.36] - - [192, 384, 64, 1152] - - [634, 9589.01] + - [638, 9589.01] - - [128, 256, 64, 200] - - [644, 8141.12] + - [648, 8141.12] - - [64, 96, 64, 1152] - - [673, 7620.88] + - [677, 7620.88] - - [128, 128, 36, 392] - - [648, 6175.51] + - [652, 6175.51] - - [80, 192, 36, 10368] - - [696, 6497.16] + - [700, 6497.16] - - [224, 224, 36, 128] - - [706, 5826.89] + - [710, 5826.89] - - [512, 512, 64, 28] - - [661, 5728.81] + - [665, 5728.81] - - [256, 16, 64, 1568] - - [676, 4637.2] + - [680, 4637.2] - - [144, 288, 64, 1152] - - [693, 7784.24] + - [697, 7784.24] - - [256, 256, 64, 576] - - [670, 9596.12] + - [674, 9596.12] - - [64, 128, 36, 784] - - [706, 6058.99] + - [710, 6058.99] - - [256, 24, 36, 128] - - [640, 2239.84] + - [644, 2239.84] - - [256, 256, 64, 2304] - - [670, 10225.7] + - [674, 10225.7] - - [192, 384, 36, 512] - - [705, 8549.03] + - [709, 8549.03] - - [16, 32, 64, 2560] - - [658, 2153.13] + - [662, 2153.13] - - [256, 512, 36, 32] - - [683, 5702.23] + - [687, 5702.23] - - [512, 512, 64, 128] - - [705, 9084.11] + - [709, 9084.11] - - [128, 128, 64, 200] - - [642, 6971.91] + - [646, 6971.91] - - [512, 512, 64, 32] - - [654, 6248.5] + - [658, 6248.5] - - [128, 256, 36, 196] - - [654, 6628.76] + - [658, 6628.76] - - [8, 384, 64, 6600] - - [686, 2733.89] + - [690, 2733.89] - - [149, 32, 64, 8195] - - [648, 6050.91] + - [652, 6050.91] - - [35, 96, 64, 6160] - - [693, 4689.35] + - [697, 4689.35] - - [64, 64, 36, 1760] - - [643, 5622.24] + - [647, 5622.24] - - [196, 528, 32, 32] - - [726, 4088.41] + - [730, 4088.41] - - [5329, 64, 32, 80] - - [719, 8331.14] + - [723, 8331.14] - - [64, 2880, 1, 320] - - [770, 4362.6] + - [774, 4362.6] - - [49, 832, 32, 256] - - [733, 5618.63] + - [737, 5618.63] - - [196, 512, 32, 24] - - [720, 3621.73] + - [724, 3621.73] - - [289, 1120, 1, 160] - - [716, 3302.86] + - [720, 3302.86] - - [1225, 192, 32, 32] - - [724, 6194.57] + - [728, 6194.57] - - [64, 2048, 32, 384] - - [747, 9541.54] + - [751, 9541.54] - - [1001, 1536, 1, 32] - - [718, 3575.67] + - [722, 3575.67] - - [289, 1792, 1, 320] - - [741, 5140.33] + - [745, 5140.33] - - [1001, 1024, 1, 32] - - [713, 2733.4] + - [717, 2733.4] - - [196, 480, 32, 64] - - [774, 5070.42] + - [778, 5070.42] - - [64, 1728, 1, 320] - - [771, 3205.57] + - [775, 3205.57] - - [49, 832, 32, 160] - - [775, 4988.82] + - [779, 4988.82] - - [49, 832, 32, 384] - - [733, 5901.95] + - [737, 5901.95] - - [289, 896, 1, 192] - - [759, 3452.59] + - [763, 3452.59] - - [289, 1024, 32, 384] - - [778, 8902.42] + - [782, 8902.42] - - [784, 192, 32, 96] - - [789, 7853.63] + - [793, 7853.63] - - [50176, 256, 1, 128] - - [752, 9041.83] + - [756, 9041.83] - - [289, 1024, 32, 256] - - [787, 8660.72] + - [791, 8660.72] - - [289, 1024, 32, 192] - - [776, 8433.35] + - [780, 8433.35] - - [12544, 512, 1, 256] - - [736, 9187.34] + - [740, 9187.34] - - [1225, 1728, 1, 192] - - [740, 7720.85] + - [744, 7720.85] - - [196, 480, 32, 96] - - [785, 5662.5] + - [789, 5662.5] - - [196, 512, 32, 144] - - [779, 6531.38] + - [783, 6531.38] - - [784, 400, 1, 32] - - [714, 1280.0] + - [718, 1280.0] - - [289, 768, 32, 128] - - [780, 7913.61] + - [784, 7913.61] - - [5329, 576, 1, 96] - - [723, 7563.46] + - [727, 7563.46] - - [49, 1200, 1, 128] - - [767, 1011.61] + - [771, 1011.61] - - [64, 1536, 32, 256] - - [781, 9159.54] + - [785, 9159.54] - - [289, 2592, 1, 384] - - [749, 6002.71] + - [753, 6002.71] - - [196, 528, 32, 128] - - [784, 5987.1] + - [788, 5987.1] - - [64, 2048, 32, 448] - - [747, 9669.87] + - [751, 9669.87] - - [5329, 448, 1, 64] - - [719, 6201.02] + - [723, 6201.02] - - [784, 256, 32, 64] - - [721, 7623.18] + - [725, 7623.18] - - [784, 192, 32, 32] - - [726, 5874.26] + - [730, 5874.26] - - [21609, 288, 1, 32] - - [739, 5296.5] + - [743, 5296.5] - - [784, 256, 32, 32] - - [717, 6235.46] + - [721, 6235.46] - - [5041, 720, 1, 192] - - [735, 8140.98] + - [739, 8140.98] - - [289, 2016, 1, 256] - - [732, 5404.05] + - [736, 5404.05] - - [196, 512, 32, 128] - - [777, 6366.82] + - [781, 6366.82] - - [289, 768, 32, 160] - - [779, 8253.88] + - [783, 8253.88] - - [64, 1536, 32, 384] - - [750, 9508.5] + - [754, 9508.5] - - [64, 1280, 32, 320] - - [750, 9070.73] + - [754, 9070.73] - - [289, 896, 1, 128] - - [760, 2917.68] + - [764, 2917.68] - - [289, 3456, 1, 384] - - [740, 7274.91] + - [744, 7274.91] - - [196, 800, 1, 64] - - [762, 1393.78] + - [766, 1393.78] - - [64, 1280, 32, 384] - - [746, 9225.01] + - [750, 9225.01] - - [64, 1344, 1, 512] - - [765, 3041.45] + - [769, 3041.45] - - [1001, 4096, 1, 512] - - [746, 9391.77] + - [750, 9391.77] - - [1225, 192, 32, 64] - - [719, 7729.29] + - [723, 7729.29] - - [64, 1152, 1, 384] - - [769, 2440.65] + - [773, 2440.65] - - [729, 1600, 1, 192] - - [731, 6827.71] + - [735, 6827.71] - - [289, 1344, 1, 192] - - [729, 4439.04] + - [733, 4439.04] - - [784, 192, 32, 16] - - [756, 3663.04] + - [760, 3663.04] - - [3136, 1024, 1, 2048] - - [738, 9071.77] + - [742, 9071.77] - - [64, 1152, 1, 448] - - [766, 2564.45] + - [770, 2564.45] - - [49, 832, 32, 128] - - [729, 4733.16] + - [733, 4733.16] - - [784, 256, 32, 128] - - [742, 8471.6] + - [746, 8471.6] - - [49, 800, 1, 128] - - [764, 633.535] + - [768, 633.535] - - [196, 512, 32, 32] - - [726, 4354.26] + - [730, 4354.26] - - [1225, 384, 32, 96] - - [743, 8751.63] + - [747, 8751.63] - - [5041, 576, 1, 96] - - [725, 7067.63] + - [729, 7067.63] - - [49, 832, 32, 48] - - [758, 3316.72] + - [762, 3316.72] - - [5329, 160, 32, 64] - - [782, 8159.84] + - [786, 8159.84] - - [1225, 288, 32, 48] - - [772, 6673.65] + - [776, 6673.65] - - [4096, 9216, 1, 512] - - [754, 10116.9] + - [758, 10116.9] - - [196, 480, 32, 192] - - [783, 6388.46] + - [787, 6388.46] - - [64, 1152, 1, 256] - - [770, 1982.6] + - [774, 1982.6] - - [3136, 1024, 1, 512] - - [738, 8745.57] + - [742, 8745.57] - - [49, 832, 32, 32] - - [757, 2717.87] + - [761, 2717.87] - - [784, 192, 32, 64] - - [721, 7216.32] + - [725, 7216.32] - - [289, 1024, 32, 128] - - [744, 7970.5] + - [748, 7970.5] - - [289, 768, 32, 192] - - [788, 8327.27] + - [792, 8327.27] - - [289, 1120, 1, 192] - - [728, 3716.9] + - [732, 3716.9] - - [196, 512, 32, 112] - - [734, 6252.81] + - [738, 6252.81] - - [1001, 2048, 1, 32] - - [722, 4000.09] + - [726, 4000.09] - - [1225, 288, 32, 64] - - [782, 7208.04] + - [786, 7208.04] - - [196, 600, 1, 64] - - [761, 1093.95] + - [765, 1093.95] - - [1225, 384, 32, 192] - - [743, 9332.66] + - [747, 9332.66] - - [50176, 256, 1, 512] - - [753, 9833.54] + - [757, 9833.54] - - [196, 512, 32, 160] - - [780, 6614.34] + - [784, 6614.34] - - [4096, 4096, 1, 512] - - [751, 10032.2] + - [755, 10032.2] - - [49, 832, 32, 192] - - [729, 5244.53] + - [733, 5244.53] - - [1225, 256, 32, 64] - - [719, 7972.35] + - [723, 7972.35] - - [64, 2048, 32, 320] - - [747, 9404.27] + - [751, 9404.27] - - [196, 480, 32, 16] - - [773, 2724.49] + - [777, 2724.49] - - [1225, 256, 32, 48] - - [721, 7100.38] + - [725, 7100.38] - - [64, 1280, 32, 448] - - [746, 9344.41] + - [750, 9344.41] - - [1225, 1200, 1, 64] - - [715, 5157.89] + - [719, 5157.89] - - [1225, 384, 32, 64] - - [719, 8219.96] + - [723, 8219.96] - - [12544, 512, 1, 1024] - - [738, 9672.72] + - [742, 9672.72] - - [64, 1280, 32, 192] - - [734, 8525.01] + - [738, 8525.01] - - [196, 512, 32, 64] - - [719, 5489.34] + - [723, 5489.34] - - [289, 1792, 1, 256] - - [737, 4831.61] + - [741, 4831.61] - - [196, 528, 32, 256] - - [755, 6453.82] + - [759, 6453.82] - - [64, 2048, 32, 192] - - [742, 8955.81] + - [746, 8955.81] - - [196, 528, 32, 160] - - [783, 6161.15] + - [787, 6161.15] - - [1225, 192, 32, 48] - - [719, 7236.92] + - [723, 7236.92] - - [64, 1728, 1, 192] - - [769, 2480.57] + - [773, 2480.57] - - [1001, 2048, 1, 64] - - [795, 5714.42] + - [799, 5714.42] - - [5329, 64, 128, 80] - - [802, 8835.29] + - [806, 8835.29] - - [64, 1280, 128, 448] - - [800, 10020.5] + - [804, 10020.5] - - [289, 768, 128, 128] - - [803, 8542.71] + - [807, 8542.71] - - [1225, 192, 128, 64] - - [792, 8444.77] + - [796, 8444.77] - - [1225, 288, 128, 48] - - [805, 7244.66] + - [809, 7244.66] - - [289, 768, 128, 192] - - [807, 8794.49] + - [811, 8794.49] - - [289, 768, 128, 160] - - [804, 8705.33] + - [808, 8705.33] - - [64, 2048, 128, 192] - - [798, 9780.26] + - [802, 9780.26] - - [64, 1280, 128, 384] - - [801, 9950.9] + - [805, 9950.9] - - [1225, 256, 128, 48] - - [793, 8273.61] + - [797, 8273.61] - - [1225, 192, 128, 48] - - [793, 8140.32] + - [797, 8140.32] - - [1225, 288, 128, 64] - - [805, 7886.21] + - [809, 7886.21] - - [64, 1280, 128, 320] - - [797, 9894.56] + - [801, 9894.56] - - [1225, 256, 128, 64] - - [798, 8572.51] + - [802, 8572.51] - - [1001, 2048, 1, 128] - - [799, 7289.06] + - [803, 7289.06] - - [1225, 192, 128, 32] - - [794, 7104.57] + - [798, 7104.57] - - [64, 1280, 128, 192] - - [806, 9642.08] + - [810, 9642.08] - - [1001, 1536, 1, 64] - - [796, 5146.56] + - [800, 5146.56] - - [2048, 2048, 1, 1024] - - [810, 9940.21] + - [814, 9940.21] - - [3200, 2048, 1, 1024] - - [809, 9899.24] + - [813, 9899.24] - - [4096, 4096, 1, 1024] - - [811, 10222.2] + - [815, 10222.2] - - [2048, 256, 1, 1024] - - [808, 8452.0] + - [812, 8452.0] - - [257, 4096, 1, 1024] - - [809, 8353.5] + - [813, 8353.5] - - [64, 2048, 64, 192] - - [814, 9434.24] + - [818, 9434.24] - - [1225, 192, 64, 48] - - [817, 7799.38] + - [821, 7799.38] - - [1225, 288, 64, 48] - - [819, 7030.37] + - [823, 7030.37] - - [3136, 64, 64, 64] - - [812, 7941.3] + - [816, 7941.3] - - [1225, 192, 64, 32] - - [818, 6772.91] + - [822, 6772.91] - - [1225, 256, 64, 48] - - [817, 8022.81] + - [821, 8022.81] - - [64, 2048, 64, 384] - - [813, 9859.28] + - [817, 9859.28] - - [64, 1280, 64, 384] - - [813, 9675.44] + - [817, 9675.44] - - [64, 1280, 64, 192] - - [813, 9320.68] + - [817, 9320.68] - - [1225, 192, 64, 64] - - [817, 8180.87] + - [821, 8180.87] - - [3136, 256, 64, 64] - - [814, 8966.88] + - [818, 8966.88] - - [1225, 288, 64, 64] - - [819, 7567.39] + - [823, 7567.39] - - [5329, 64, 64, 80] - - [816, 8634.33] + - [820, 8634.33] - - [64, 1280, 64, 448] - - [813, 9702.62] + - [817, 9702.62] - - [1225, 256, 64, 64] - - [817, 8306.43] + - [821, 8306.43] - - [3136, 64, 64, 256] - - [815, 9431.89] + - [819, 9431.89] - - [64, 1280, 64, 320] - - [813, 9754.2] + - [817, 9754.2] - - [64, 2048, 64, 320] - - [813, 9765.55] + - [817, 9765.55] - - [64, 2048, 64, 448] - - [813, 9948.37] + - [817, 9948.37] + - - [65, 1024, 1, 6400] + - [824, 3556.98] + - - [256, 4096, 1, 6400] + - [825, 10132.4] + - - [1024, 4096, 1, 64] + - [826, 6918.44] + - - [1024, 4096, 1, 6336] + - [827, 10393.9] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml index 9af2a05c1..e740ea571 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Ailk_Bljk_SB.yaml @@ -116061,6 +116061,543 @@ WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 722 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 723 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 + LdsPadA: 0 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 724 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -116189,7 +116726,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 722 + SolutionIndex: 725 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116338,7 +116875,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 723 + SolutionIndex: 726 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116487,7 +117024,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 724 + SolutionIndex: 727 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -116632,7 +117169,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 725 + SolutionIndex: 728 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -116777,7 +117314,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 726 + SolutionIndex: 729 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -116922,7 +117459,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 727 + SolutionIndex: 730 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -117067,7 +117604,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 728 + SolutionIndex: 731 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -117216,7 +117753,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 729 + SolutionIndex: 732 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -117361,7 +117898,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 730 + SolutionIndex: 733 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -117506,7 +118043,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 731 + SolutionIndex: 734 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x008x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -117655,7 +118192,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 732 + SolutionIndex: 735 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT08_02_USFGRO0_VW02_WG08_04_08 SubGroup0: 8 SubGroup1: 4 @@ -117800,7 +118337,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 733 + SolutionIndex: 736 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -117945,7 +118482,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 734 + SolutionIndex: 737 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT02_02_USFGRO0_VW02_WG32_02_04 SubGroup0: 32 SubGroup1: 2 @@ -118090,7 +118627,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 735 + SolutionIndex: 738 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x004x64_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT04_02_USFGRO0_VW02_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -118239,7 +118776,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 736 + SolutionIndex: 739 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x036x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT06_03_USFGRO01_VW01_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -118388,7 +118925,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 737 + SolutionIndex: 740 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT036x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG12_16_01 SubGroup0: 12 SubGroup1: 16 @@ -118537,7 +119074,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 738 + SolutionIndex: 741 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x048x16_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_03_USFGRO01_VW01_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -118686,7 +119223,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 739 + SolutionIndex: 742 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -118835,7 +119372,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 740 + SolutionIndex: 743 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT024x024x32_DTL0_EPS1_GRVW01_LPB00_PGR1_SNLL0_TT03_04_USFGRO01_VW01_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -118984,7 +119521,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 741 + SolutionIndex: 744 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT048x024x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT06_04_USFGRO0_VW02_WG08_06_04 SubGroup0: 8 SubGroup1: 6 @@ -119133,7 +119670,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 742 + SolutionIndex: 745 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -119282,7 +119819,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 743 + SolutionIndex: 746 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119431,7 +119968,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 744 + SolutionIndex: 747 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119580,7 +120117,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 745 + SolutionIndex: 748 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -119729,7 +120266,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 746 + SolutionIndex: 749 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -119878,7 +120415,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 747 + SolutionIndex: 750 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120027,7 +120564,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 748 + SolutionIndex: 751 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120176,7 +120713,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 749 + SolutionIndex: 752 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120325,7 +120862,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 750 + SolutionIndex: 753 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x08_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -120474,7 +121011,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 751 + SolutionIndex: 754 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120623,7 +121160,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 752 + SolutionIndex: 755 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x08_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -120772,7 +121309,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 753 + SolutionIndex: 756 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -120921,7 +121458,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 754 + SolutionIndex: 757 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121070,7 +121607,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 755 + SolutionIndex: 758 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121219,7 +121756,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 756 + SolutionIndex: 759 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121368,7 +121905,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 757 + SolutionIndex: 760 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121517,7 +122054,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 758 + SolutionIndex: 761 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121666,7 +122203,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 759 + SolutionIndex: 762 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -121815,7 +122352,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 760 + SolutionIndex: 763 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x24_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -121964,7 +122501,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 761 + SolutionIndex: 764 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -122113,7 +122650,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 762 + SolutionIndex: 765 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122262,7 +122799,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 763 + SolutionIndex: 766 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -122411,7 +122948,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 764 + SolutionIndex: 767 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122560,7 +123097,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 765 + SolutionIndex: 768 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -122709,7 +123246,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 766 + SolutionIndex: 769 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -122858,7 +123395,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 767 + SolutionIndex: 770 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -123007,7 +123544,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 768 + SolutionIndex: 771 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123156,7 +123693,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 769 + SolutionIndex: 772 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123305,7 +123842,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 770 + SolutionIndex: 773 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123454,7 +123991,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 771 + SolutionIndex: 774 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -123603,7 +124140,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 772 + SolutionIndex: 775 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -123752,7 +124289,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 773 + SolutionIndex: 776 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -123901,7 +124438,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 774 + SolutionIndex: 777 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -124050,7 +124587,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 775 + SolutionIndex: 778 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -124199,7 +124736,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 776 + SolutionIndex: 779 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -124348,7 +124885,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 777 + SolutionIndex: 780 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -124497,7 +125034,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 778 + SolutionIndex: 781 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124646,7 +125183,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 779 + SolutionIndex: 782 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124795,7 +125332,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 780 + SolutionIndex: 783 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -124944,7 +125481,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 781 + SolutionIndex: 784 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -125093,7 +125630,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 782 + SolutionIndex: 785 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -125242,7 +125779,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 783 + SolutionIndex: 786 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125391,7 +125928,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 784 + SolutionIndex: 787 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125540,7 +126077,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 785 + SolutionIndex: 788 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -125689,7 +126226,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 786 + SolutionIndex: 789 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG32_04_02 SubGroup0: 32 SubGroup1: 4 @@ -125838,7 +126375,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 787 + SolutionIndex: 790 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -125987,7 +126524,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 788 + SolutionIndex: 791 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126136,7 +126673,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 789 + SolutionIndex: 792 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126285,7 +126822,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 790 + SolutionIndex: 793 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -126434,7 +126971,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 791 + SolutionIndex: 794 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -126583,7 +127120,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 792 + SolutionIndex: 795 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -126732,7 +127269,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 793 + SolutionIndex: 796 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -126881,7 +127418,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 794 + SolutionIndex: 797 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -127030,7 +127567,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 795 + SolutionIndex: 798 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -127179,7 +127716,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 796 + SolutionIndex: 799 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -127328,7 +127865,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 797 + SolutionIndex: 800 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -127477,7 +128014,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 798 + SolutionIndex: 801 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127626,7 +128163,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 799 + SolutionIndex: 802 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127775,7 +128312,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 800 + SolutionIndex: 803 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -127924,7 +128461,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 801 + SolutionIndex: 804 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -128073,7 +128610,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 802 + SolutionIndex: 805 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128222,7 +128759,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 803 + SolutionIndex: 806 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128371,7 +128908,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 804 + SolutionIndex: 807 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -128520,7 +129057,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 805 + SolutionIndex: 808 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -128669,7 +129206,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 806 + SolutionIndex: 809 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -128818,7 +129355,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 807 + SolutionIndex: 810 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -128967,7 +129504,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 808 + SolutionIndex: 811 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129116,7 +129653,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 809 + SolutionIndex: 812 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129265,7 +129802,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 810 + SolutionIndex: 813 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -129414,7 +129951,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 811 + SolutionIndex: 814 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -129563,7 +130100,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 812 + SolutionIndex: 815 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_08_USFGRO0_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -129712,7 +130249,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 813 + SolutionIndex: 816 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -129861,7 +130398,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 814 + SolutionIndex: 817 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130010,7 +130547,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 815 + SolutionIndex: 818 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130159,7 +130696,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 816 + SolutionIndex: 819 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130308,7 +130845,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 817 + SolutionIndex: 820 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130457,7 +130994,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 818 + SolutionIndex: 821 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO0_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -130606,7 +131143,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 819 + SolutionIndex: 822 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x016x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT04_02_USFGRO0_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -130755,7 +131292,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 820 + SolutionIndex: 823 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -130904,7 +131441,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 821 + SolutionIndex: 824 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x008x32_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO0_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -131049,7 +131586,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 822 + SolutionIndex: 825 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131194,7 +131731,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 823 + SolutionIndex: 826 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131343,7 +131880,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 824 + SolutionIndex: 827 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131492,7 +132029,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 825 + SolutionIndex: 828 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131637,7 +132174,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 826 + SolutionIndex: 829 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131782,7 +132319,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 827 + SolutionIndex: 830 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -131931,7 +132468,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 828 + SolutionIndex: 831 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132076,7 +132613,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 829 + SolutionIndex: 832 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132225,7 +132762,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 830 + SolutionIndex: 833 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132374,7 +132911,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 831 + SolutionIndex: 834 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132519,7 +133056,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 832 + SolutionIndex: 835 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS0_GRVW02_LPB00_PGR0_SNLL0_TT06_08_USFGRO00_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132668,7 +133205,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 833 + SolutionIndex: 836 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132813,7 +133350,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 834 + SolutionIndex: 837 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -132958,7 +133495,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 835 + SolutionIndex: 838 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133103,7 +133640,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 836 + SolutionIndex: 839 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133248,7 +133785,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 837 + SolutionIndex: 840 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133397,7 +133934,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 838 + SolutionIndex: 841 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133546,7 +134083,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 839 + SolutionIndex: 842 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133695,7 +134232,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 840 + SolutionIndex: 843 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x08_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133840,7 +134377,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 841 + SolutionIndex: 844 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -133989,7 +134526,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 842 + SolutionIndex: 845 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134134,7 +134671,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 843 + SolutionIndex: 846 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134283,7 +134820,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 844 + SolutionIndex: 847 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134432,7 +134969,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 845 + SolutionIndex: 848 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134577,7 +135114,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 846 + SolutionIndex: 849 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134726,7 +135263,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 847 + SolutionIndex: 850 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -134875,7 +135412,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 848 + SolutionIndex: 851 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135020,7 +135557,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 849 + SolutionIndex: 852 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135169,7 +135706,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 850 + SolutionIndex: 853 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135314,7 +135851,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 851 + SolutionIndex: 854 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135463,7 +136000,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 852 + SolutionIndex: 855 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135608,7 +136145,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 853 + SolutionIndex: 856 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135757,7 +136294,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 854 + SolutionIndex: 857 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -135906,7 +136443,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 855 + SolutionIndex: 858 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT096x128x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT06_08_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136051,7 +136588,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 856 + SolutionIndex: 859 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136200,7 +136737,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 857 + SolutionIndex: 860 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136349,7 +136886,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 858 + SolutionIndex: 861 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x096x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT08_06_USFGRO0_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136494,7 +137031,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 859 + SolutionIndex: 862 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136643,7 +137180,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 860 + SolutionIndex: 863 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136788,7 +137325,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 861 + SolutionIndex: 864 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -136937,7 +137474,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 862 + SolutionIndex: 865 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137082,7 +137619,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 863 + SolutionIndex: 866 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137231,7 +137768,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 864 + SolutionIndex: 867 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x24_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137376,7 +137913,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 865 + SolutionIndex: 868 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137525,7 +138062,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 866 + SolutionIndex: 869 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137670,7 +138207,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 867 + SolutionIndex: 870 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137819,7 +138356,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 868 + SolutionIndex: 871 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -137964,7 +138501,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 869 + SolutionIndex: 872 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138113,7 +138650,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 870 + SolutionIndex: 873 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138258,7 +138795,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 871 + SolutionIndex: 874 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138407,7 +138944,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 872 + SolutionIndex: 875 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138552,7 +139089,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 873 + SolutionIndex: 876 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138701,7 +139238,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 874 + SolutionIndex: 877 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138846,7 +139383,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 875 + SolutionIndex: 878 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS0_GRVW04_LPB00_PGR0_SNLL0_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -138995,7 +139532,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 876 + SolutionIndex: 879 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x32_DTL0_EPS1_GRVW04_LPB00_PGR1_SNLL1_TT08_08_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -139144,7 +139681,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 877 + SolutionIndex: 880 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS0_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO00_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139293,7 +139830,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 878 + SolutionIndex: 881 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x032x02_DTL0_EPS0_GRVW04_LPB00_PGR1_SNLL0_TT04_04_USFGRO00_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139442,7 +139979,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 879 + SolutionIndex: 882 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL1_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -139591,7 +140128,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 880 + SolutionIndex: 883 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -139740,7 +140277,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 881 + SolutionIndex: 884 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -139889,7 +140426,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 882 + SolutionIndex: 885 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x04_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -140038,7 +140575,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 883 + SolutionIndex: 886 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -140187,7 +140724,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 884 + SolutionIndex: 887 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -140336,7 +140873,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 885 + SolutionIndex: 888 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT008x008x08_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -140485,7 +141022,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 886 + SolutionIndex: 889 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT016x016x16_DTL0_EPS1_GRVW02_LPB00_PGR1_SNLL0_TT02_02_USFGRO01_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -140634,7 +141171,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 887 + SolutionIndex: 890 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -140783,7 +141320,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 888 + SolutionIndex: 891 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x08_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -140932,7 +141469,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 889 + SolutionIndex: 892 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -141081,7 +141618,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 890 + SolutionIndex: 893 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS1_GRVW04_LPB04_PGR1_SNLL1_TT04_08_USFGRO0_VW04_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141226,7 +141763,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 891 + SolutionIndex: 894 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT032x128x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG08_32_01 SubGroup0: 8 SubGroup1: 32 @@ -141371,7 +141908,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 892 + SolutionIndex: 895 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x032x16_DTL0_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT04_04_USFGRO01_VW02_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141516,7 +142053,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 893 + SolutionIndex: 896 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x16_DTL1_EPS0_GRVW01_LPB02_PGR0_SNLL0_TT08_04_USFGRO01_VW02_WG32_08_01 SubGroup0: 32 SubGroup1: 8 @@ -141661,7 +142198,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 894 + SolutionIndex: 897 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x032x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW01_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -141806,7 +142343,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 895 + SolutionIndex: 898 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW01_LPB04_PGR0_SNLL0_TT04_04_USFGRO01_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -141951,7 +142488,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 896 + SolutionIndex: 899 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT04_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -142096,7 +142633,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 897 + SolutionIndex: 900 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_EPS0_GRVW04_LPB04_PGR0_SNLL0_TT08_04_USFGRO0_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -142241,7 +142778,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 898 + SolutionIndex: 901 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW01_GSU08_PGR1_PLR1_TT08_04_USFGRO01_VW01_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -142383,7 +142920,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 899 + SolutionIndex: 902 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x032x08_DTL1_GRVW01_GSU01_PGR0_PLR1_TT08_04_USFGRO01_VW04_WG32_08_01_WGM08 SubGroup0: 32 SubGroup1: 8 @@ -142529,7 +143066,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 900 + SolutionIndex: 903 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW02_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -142675,7 +143212,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 901 + SolutionIndex: 904 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x08_DTL0_GRVW02_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW02_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -142821,7 +143358,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 902 + SolutionIndex: 905 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -142967,7 +143504,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 903 + SolutionIndex: 906 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x08_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG16_08_01_WGM08 SubGroup0: 16 SubGroup1: 8 @@ -143109,7 +143646,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 904 + SolutionIndex: 907 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG08_32_01_WGM01 SubGroup0: 8 SubGroup1: 32 @@ -143255,7 +143792,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 905 + SolutionIndex: 908 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143397,7 +143934,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 906 + SolutionIndex: 909 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT064x128x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT04_08_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143539,7 +144076,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 907 + SolutionIndex: 910 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR0_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143685,7 +144222,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 908 + SolutionIndex: 911 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT08_04_USFGRO0_VW04_WG16_16_01_WGM01 SubGroup0: 16 SubGroup1: 16 @@ -143831,7 +144368,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 909 + SolutionIndex: 912 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -143977,7 +144514,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 910 + SolutionIndex: 913 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x064x16_DTL0_GRVW04_GSU01_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM08 SubGroup0: 32 SubGroup1: 8 @@ -144134,7 +144671,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 + SolutionIndex: 914 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -144296,7 +144833,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 + SolutionIndex: 915 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -144458,7 +144995,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 + SolutionIndex: 916 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -144620,7 +145157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 + SolutionIndex: 917 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -144782,7 +145319,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 + SolutionIndex: 918 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_EPS1_GRVW4_GSU1_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -144944,7 +145481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 + SolutionIndex: 919 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW2_GSU1_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145106,7 +145643,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 + SolutionIndex: 920 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145268,7 +145805,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 + SolutionIndex: 921 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145430,7 +145967,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 + SolutionIndex: 922 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145592,7 +146129,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 + SolutionIndex: 923 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145754,7 +146291,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 + SolutionIndex: 924 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -145916,7 +146453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 + SolutionIndex: 925 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_4_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -146078,7 +146615,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 + SolutionIndex: 926 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -146240,7 +146777,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 + SolutionIndex: 927 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146402,7 +146939,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 + SolutionIndex: 928 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146564,7 +147101,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 + SolutionIndex: 929 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146726,7 +147263,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 + SolutionIndex: 930 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146888,7 +147425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 + SolutionIndex: 931 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147050,7 +147587,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 + SolutionIndex: 932 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147212,7 +147749,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 + SolutionIndex: 933 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147374,7 +147911,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 + SolutionIndex: 934 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147536,7 +148073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 + SolutionIndex: 935 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147698,7 +148235,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 + SolutionIndex: 936 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -147860,7 +148397,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 + SolutionIndex: 937 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148022,7 +148559,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 + SolutionIndex: 938 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148184,7 +148721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 + SolutionIndex: 939 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_EPS1_GRVW4_GSU1_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -148348,7 +148885,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 + SolutionIndex: 940 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148512,7 +149049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 + SolutionIndex: 941 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148676,7 +149213,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 + SolutionIndex: 942 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x32x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG64_4_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -148840,7 +149377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 + SolutionIndex: 943 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS1_PGR1_SNLL1_TT4_4_WG32_8_1_WGM4 StaggerU: 32 StaggerUMapping: 0 @@ -149004,7 +149541,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 + SolutionIndex: 944 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -149168,7 +149705,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 + SolutionIndex: 945 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM15 StaggerU: 32 StaggerUMapping: 0 @@ -149332,7 +149869,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 + SolutionIndex: 946 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -149496,7 +150033,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 + SolutionIndex: 947 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM17 StaggerU: 32 StaggerUMapping: 0 @@ -149656,7 +150193,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 + SolutionIndex: 948 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM17 StaggerU: 32 StaggerUMapping: 0 @@ -149820,7 +150357,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 + SolutionIndex: 949 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT4_8_WG32_8_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -149980,7 +150517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 + SolutionIndex: 950 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT256x16x8_SE_EPS0_PGR0_SNLL0_TT4_4_WG64_4_1_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -150144,7 +150681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 + SolutionIndex: 951 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -150308,7 +150845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 + SolutionIndex: 952 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM2 StaggerU: 32 StaggerUMapping: 0 @@ -150472,7 +151009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 + SolutionIndex: 953 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_16_1_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -150636,7 +151173,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 + SolutionIndex: 954 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM7 StaggerU: 32 StaggerUMapping: 0 @@ -150800,7 +151337,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 + SolutionIndex: 955 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM11 StaggerU: 32 StaggerUMapping: 0 @@ -150964,7 +151501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 + SolutionIndex: 956 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -151128,7 +151665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 + SolutionIndex: 957 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -151292,7 +151829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 + SolutionIndex: 958 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_8_2_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -151456,7 +151993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 + SolutionIndex: 959 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_PGR1_SNLL1_TT4_8_WG16_8_2_WGM32 StaggerU: 32 StaggerUMapping: 0 @@ -151620,7 +152157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 + SolutionIndex: 960 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151784,7 +152321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 + SolutionIndex: 961 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151948,7 +152485,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 + SolutionIndex: 962 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152112,7 +152649,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 + SolutionIndex: 963 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_PGR1_SNLL1_TT8_4_WG16_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152276,7 +152813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 + SolutionIndex: 964 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_PGR1_SNLL1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152443,7 +152980,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 + SolutionIndex: 965 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152606,7 +153143,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 + SolutionIndex: 966 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152773,7 +153310,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 + SolutionIndex: 967 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152936,7 +153473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 + SolutionIndex: 968 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153103,7 +153640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 + SolutionIndex: 969 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153266,7 +153803,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 + SolutionIndex: 970 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153433,7 +153970,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 + SolutionIndex: 971 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153596,7 +154133,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 + SolutionIndex: 972 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153763,7 +154300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 + SolutionIndex: 973 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153924,7 +154461,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 + SolutionIndex: 974 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154085,7 +154622,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 + SolutionIndex: 975 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154248,7 +154785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 + SolutionIndex: 976 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154415,7 +154952,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 + SolutionIndex: 977 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154580,7 +155117,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 + SolutionIndex: 978 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL0_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154743,7 +155280,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 + SolutionIndex: 979 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR0_SNLL1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154910,7 +155447,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 + SolutionIndex: 980 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155073,7 +155610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 + SolutionIndex: 981 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155240,7 +155777,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 + SolutionIndex: 982 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155403,7 +155940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 + SolutionIndex: 983 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155566,7 +156103,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 + SolutionIndex: 984 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155731,7 +156268,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 + SolutionIndex: 985 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155894,7 +156431,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 + SolutionIndex: 986 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156057,7 +156594,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 + SolutionIndex: 987 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156222,7 +156759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 + SolutionIndex: 988 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156385,7 +156922,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 + SolutionIndex: 989 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156548,7 +157085,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 + SolutionIndex: 990 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156709,7 +157246,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 + SolutionIndex: 991 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156870,7 +157407,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 + SolutionIndex: 992 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157031,7 +157568,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 + SolutionIndex: 993 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157192,7 +157729,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 + SolutionIndex: 994 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157357,7 +157894,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 + SolutionIndex: 995 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157520,7 +158057,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 + SolutionIndex: 996 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157687,7 +158224,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 + SolutionIndex: 997 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157850,7 +158387,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 + SolutionIndex: 998 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_SNLL1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158013,7 +158550,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 + SolutionIndex: 999 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158172,7 +158709,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 + SolutionIndex: 1000 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL1_GRVW4_LPB0_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158335,7 +158872,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 + SolutionIndex: 1001 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158496,7 +159033,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 + SolutionIndex: 1002 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158661,7 +159198,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 + SolutionIndex: 1003 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158822,7 +159359,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 + SolutionIndex: 1004 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158983,7 +159520,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 + SolutionIndex: 1005 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159144,7 +159681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 + SolutionIndex: 1006 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159309,7 +159846,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 + SolutionIndex: 1007 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL0_GRVW1_LPB1_PGR1_PLR1_SNLL1_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159470,7 +160007,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 + SolutionIndex: 1008 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159631,7 +160168,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 + SolutionIndex: 1009 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159792,7 +160329,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 + SolutionIndex: 1010 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159953,7 +160490,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 + SolutionIndex: 1011 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160114,7 +160651,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 + SolutionIndex: 1012 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160275,7 +160812,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 + SolutionIndex: 1013 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160436,7 +160973,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 + SolutionIndex: 1014 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR0_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160597,7 +161134,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 + SolutionIndex: 1015 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160758,7 +161295,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 + SolutionIndex: 1016 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160919,7 +161456,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 + SolutionIndex: 1017 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161080,7 +161617,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 + SolutionIndex: 1018 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161241,7 +161778,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 + SolutionIndex: 1019 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_DTL0_EPS0_FL0_GRVW1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161402,7 +161939,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 + SolutionIndex: 1020 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161563,7 +162100,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 + SolutionIndex: 1021 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161724,7 +162261,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 + SolutionIndex: 1022 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161885,7 +162422,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 + SolutionIndex: 1023 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162044,7 +162581,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 + SolutionIndex: 1024 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162204,7 +162741,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 + SolutionIndex: 1025 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162364,7 +162901,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 + SolutionIndex: 1026 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162524,7 +163061,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 + SolutionIndex: 1027 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162684,7 +163221,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 + SolutionIndex: 1028 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162844,7 +163381,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 + SolutionIndex: 1029 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163004,7 +163541,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 + SolutionIndex: 1030 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163168,7 +163705,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 + SolutionIndex: 1031 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163328,7 +163865,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 + SolutionIndex: 1032 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163488,7 +164025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 + SolutionIndex: 1033 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163648,7 +164185,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 + SolutionIndex: 1034 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163808,7 +164345,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 + SolutionIndex: 1035 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163968,7 +164505,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 + SolutionIndex: 1036 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164128,7 +164665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 + SolutionIndex: 1037 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164292,7 +164829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 + SolutionIndex: 1038 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164452,7 +164989,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 + SolutionIndex: 1039 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164616,7 +165153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 + SolutionIndex: 1040 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164776,7 +165313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 + SolutionIndex: 1041 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164940,7 +165477,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 + SolutionIndex: 1042 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB0_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165100,7 +165637,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 + SolutionIndex: 1043 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165260,7 +165797,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 + SolutionIndex: 1044 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165420,7 +165957,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 + SolutionIndex: 1045 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165580,7 +166117,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 + SolutionIndex: 1046 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165740,7 +166277,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 + SolutionIndex: 1047 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165904,7 +166441,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 + SolutionIndex: 1048 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166068,7 +166605,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 + SolutionIndex: 1049 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166232,7 +166769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 + SolutionIndex: 1050 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166392,7 +166929,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 + SolutionIndex: 1051 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166556,7 +167093,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 + SolutionIndex: 1052 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166720,7 +167257,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 + SolutionIndex: 1053 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166880,7 +167417,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 + SolutionIndex: 1054 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167044,7 +167581,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 + SolutionIndex: 1055 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167208,7 +167745,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 + SolutionIndex: 1056 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167368,7 +167905,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 + SolutionIndex: 1057 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167532,7 +168069,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 + SolutionIndex: 1058 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167696,7 +168233,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 + SolutionIndex: 1059 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167860,7 +168397,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 + SolutionIndex: 1060 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168020,7 +168557,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 + SolutionIndex: 1061 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168184,7 +168721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 + SolutionIndex: 1062 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168344,7 +168881,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 + SolutionIndex: 1063 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168508,7 +169045,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 + SolutionIndex: 1064 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168672,7 +169209,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 + SolutionIndex: 1065 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168836,7 +169373,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 + SolutionIndex: 1066 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168996,7 +169533,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 + SolutionIndex: 1067 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169160,7 +169697,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 + SolutionIndex: 1068 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169324,7 +169861,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 + SolutionIndex: 1069 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169488,7 +170025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 + SolutionIndex: 1070 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169652,7 +170189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 + SolutionIndex: 1071 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169816,7 +170353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 + SolutionIndex: 1072 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169980,7 +170517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 + SolutionIndex: 1073 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170144,7 +170681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 + SolutionIndex: 1074 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170308,7 +170845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 + SolutionIndex: 1075 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170472,7 +171009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 + SolutionIndex: 1076 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170632,7 +171169,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 + SolutionIndex: 1077 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170796,7 +171333,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 + SolutionIndex: 1078 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170960,7 +171497,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 + SolutionIndex: 1079 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171124,7 +171661,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 + SolutionIndex: 1080 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_2_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171288,7 +171825,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 + SolutionIndex: 1081 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171452,7 +171989,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 + SolutionIndex: 1082 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171616,7 +172153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 + SolutionIndex: 1083 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_2_VW2_WG8_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171776,7 +172313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 + SolutionIndex: 1084 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171936,7 +172473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 + SolutionIndex: 1085 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172096,7 +172633,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 + SolutionIndex: 1086 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172256,7 +172793,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 + SolutionIndex: 1087 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172416,7 +172953,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 + SolutionIndex: 1088 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172576,7 +173113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 + SolutionIndex: 1089 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172736,7 +173273,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 + SolutionIndex: 1090 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172896,7 +173433,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 + SolutionIndex: 1091 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173056,7 +173593,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 + SolutionIndex: 1092 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173216,7 +173753,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 + SolutionIndex: 1093 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173376,7 +173913,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 + SolutionIndex: 1094 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173536,7 +174073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 + SolutionIndex: 1095 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173700,7 +174237,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 + SolutionIndex: 1096 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173864,7 +174401,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 + SolutionIndex: 1097 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG2_32_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174024,7 +174561,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 + SolutionIndex: 1098 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174188,7 +174725,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 + SolutionIndex: 1099 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174352,7 +174889,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 + SolutionIndex: 1100 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR0_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174516,7 +175053,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 + SolutionIndex: 1101 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174676,7 +175213,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 + SolutionIndex: 1102 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174840,7 +175377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 + SolutionIndex: 1103 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175000,7 +175537,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 + SolutionIndex: 1104 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT4x64x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG2_32_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175164,7 +175701,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 + SolutionIndex: 1105 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175328,7 +175865,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 + SolutionIndex: 1106 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175488,7 +176025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 + SolutionIndex: 1107 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175652,7 +176189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 + SolutionIndex: 1108 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175816,7 +176353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 + SolutionIndex: 1109 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG2_16_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175980,7 +176517,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 + SolutionIndex: 1110 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176144,7 +176681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 + SolutionIndex: 1111 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176308,7 +176845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 + SolutionIndex: 1112 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176472,7 +177009,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 + SolutionIndex: 1113 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176636,7 +177173,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 + SolutionIndex: 1114 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176800,7 +177337,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 + SolutionIndex: 1115 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176964,7 +177501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 + SolutionIndex: 1116 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177128,7 +177665,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 + SolutionIndex: 1117 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177292,7 +177829,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 + SolutionIndex: 1118 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177456,7 +177993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 + SolutionIndex: 1119 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177620,7 +178157,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 + SolutionIndex: 1120 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG8_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177784,7 +178321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 + SolutionIndex: 1121 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG32_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177948,7 +178485,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 + SolutionIndex: 1122 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_2_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178112,7 +178649,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 + SolutionIndex: 1123 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR0_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178272,7 +178809,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 + SolutionIndex: 1124 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178436,7 +178973,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 + SolutionIndex: 1125 SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178600,20 +179137,184 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + SolutionIndex: 1126 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT2_4_VW2_WG8_4_8_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 4 + SubGroupA: 8 + SubGroupB: 4 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 4, 8] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 + LdcEqualsLdd: false + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1127 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 4 - SubGroupA: 8 - SubGroupB: 4 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -178621,10 +179322,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 4, 8] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -178764,8 +179465,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1128 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178786,7 +179487,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -178808,7 +179509,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -178836,13 +179537,9 @@ LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -178883,8 +179580,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -178928,8 +179625,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1129 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -178937,7 +179634,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -178950,7 +179647,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -178966,7 +179663,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -178992,17 +179689,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179015,7 +179712,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179031,8 +179728,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179088,8 +179785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1130 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179112,7 +179809,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179132,37 +179829,205 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1131 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -179188,11 +180053,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179203,8 +180068,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -179248,8 +180113,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1132 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179257,7 +180122,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -179268,9 +180133,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -179286,7 +180151,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179311,18 +180176,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7168 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -179339,10 +180204,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -179351,12 +180216,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179412,8 +180277,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1133 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179422,10 +180287,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -179434,9 +180299,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179450,7 +180315,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -179458,41 +180323,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -179503,7 +180368,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179516,7 +180381,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -179576,8 +180441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1134 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179596,11 +180461,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179614,49 +180479,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -179667,7 +180528,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -179680,11 +180541,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -179695,7 +180556,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -179740,8 +180601,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1135 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179749,7 +180610,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -179760,11 +180621,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -179904,8 +180765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 + SolutionIndex: 1136 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -179926,7 +180787,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -179942,13 +180803,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -179968,34 +180829,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180003,13 +180868,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180019,7 +180884,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -180064,16 +180929,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1137 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -180085,10 +180950,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -180102,13 +180967,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -180128,21 +180993,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3088 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -180155,7 +181016,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -180171,8 +181032,8 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -180183,7 +181044,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -180228,8 +181089,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1138 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180237,7 +181098,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -180250,9 +181111,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -180392,8 +181253,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1139 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -180414,7 +181275,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -180455,34 +181316,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 8 - LSPB: 32 - LVCA: 32 + LSPA: 16 + LSPB: 64 + LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180491,13 +181352,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180508,7 +181369,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -180552,14 +181413,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 + SolutionIndex: 1140 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -180573,7 +181434,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -180615,22 +181476,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 + LSPB: 64 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 8192 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -180644,10 +181505,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 4 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -180716,15 +181577,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1141 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -180737,7 +181598,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -180779,34 +181640,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180815,13 +181676,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180876,14 +181737,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1142 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -180897,8 +181758,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -180920,7 +181781,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -180939,38 +181800,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 LVCB: 8 - LVPA: 8 - LVPB: 32 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -180979,13 +181836,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -180995,7 +181852,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181040,16 +181897,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_4_WGM8 + SolutionIndex: 1143 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181061,8 +181918,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -181078,13 +181935,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -181104,30 +181961,34 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 - LSPB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -181139,13 +182000,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181155,8 +182016,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -181200,8 +182061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1144 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -181209,7 +182070,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181221,10 +182082,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181238,13 +182099,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -181264,34 +182125,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCB: 32 + LSPA: 16 LSPB: 32 LVCA: 32 - LVCB: 8 - LVPA: 4 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181299,13 +182164,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181315,7 +182180,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -181360,16 +182225,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1145 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false + SubGroupB: 8 + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -181381,10 +182246,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -181423,38 +182288,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -181463,11 +182328,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 1024 PackBatchDims: 0 @@ -181524,14 +182389,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM1 + SolutionIndex: 1146 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -181545,7 +182410,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -181589,20 +182454,20 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 16 + LVPA: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -181617,9 +182482,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181631,9 +182496,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181688,15 +182553,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM1 + SolutionIndex: 1147 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -181709,8 +182574,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -181751,39 +182616,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 64 + LSPB: 32 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -181791,13 +182656,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -181852,15 +182717,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG32_16_2_WGM1 + SolutionIndex: 1148 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -181873,8 +182738,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -181890,7 +182755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -181898,52 +182763,52 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 16 - LVPB: 32 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -181955,13 +182820,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182016,8 +182881,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_16_4_WGM8 + SolutionIndex: 1149 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182036,11 +182901,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -182054,7 +182919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -182062,33 +182927,33 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCB: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7232 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 @@ -182096,22 +182961,22 @@ LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182119,13 +182984,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -182180,31 +183045,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG16_8_4_WGM8 + SolutionIndex: 1150 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -182224,7 +183089,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -182252,13 +183117,9 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182299,8 +183160,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -182344,8 +183205,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1151 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182353,7 +183214,7 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -182366,7 +183227,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182407,22 +183268,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182436,9 +183297,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -182447,11 +183308,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -182508,8 +183369,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1152 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182518,10 +183379,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -182530,7 +183391,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182552,7 +183413,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -182580,9 +183441,13 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182597,9 +183462,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182607,12 +183472,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -182623,8 +183488,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -182668,8 +183533,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 + SolutionIndex: 1153 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182677,12 +183542,12 @@ SubGroup1: 16 SubGroupA: 16 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] + SuppressNoLoadLoop: true + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -182690,7 +183555,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -182731,22 +183596,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 - LSPA: 16 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182760,9 +183625,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -182771,11 +183636,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -182832,8 +183697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1154 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -182842,10 +183707,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -182870,7 +183735,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -182896,21 +183761,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -182923,11 +183788,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 128 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -182935,11 +183800,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -182996,8 +183861,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM16 + SolutionIndex: 1155 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183006,11 +183871,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -183018,9 +183883,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183034,7 +183899,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -183043,7 +183908,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183059,18 +183924,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSCA: 64 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7232 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 @@ -183080,18 +183945,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183099,13 +183964,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183160,20 +184025,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1156 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183181,10 +184046,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -183225,14 +184090,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -183244,14 +184109,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -183263,13 +184128,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183324,28 +184189,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 + SolutionIndex: 1157 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183387,22 +184252,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -183416,9 +184281,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -183427,11 +184292,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 512 PackBatchDims: 0 @@ -183488,8 +184353,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1158 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -183498,10 +184363,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183652,28 +184517,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM1 + SolutionIndex: 1159 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183717,16 +184582,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 32 LVCB: 16 - LVPA: 4 - LVPB: 16 + LVPA: 8 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 LdsOffsetB: 4096 @@ -183736,18 +184601,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183759,9 +184624,9 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -183816,20 +184681,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG16_8_4_WGM1 + SolutionIndex: 1160 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -183837,7 +184702,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -183863,7 +184728,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -183882,19 +184747,19 @@ LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -183909,9 +184774,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -183919,8 +184784,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -183980,20 +184845,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM1 + SolutionIndex: 1161 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184001,8 +184866,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184027,7 +184892,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184043,38 +184908,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 32 LSPB: 64 - LVCA: 32 - LVCB: 16 + LVCA: 16 + LVCB: 8 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -184089,7 +184954,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184144,29 +185009,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM1 + SolutionIndex: 1162 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184207,39 +185072,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 32 - LSPB: 32 - LVCA: 16 + LSPB: 64 + LVCA: 32 LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184247,13 +185112,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184308,15 +185173,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_4_WGM8 + SolutionIndex: 1163 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -184329,7 +185194,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -184373,14 +185238,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 32 - LSPB: 64 + LSPA: 16 + LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 + LdsNumElements: 12416 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -184392,14 +185257,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -184411,13 +185276,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184472,29 +185337,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM8 + SolutionIndex: 1164 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184535,39 +185400,39 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 64 - LVCA: 32 + LSPB: 32 + LVCA: 16 LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184575,13 +185440,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184592,7 +185457,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -184636,15 +185501,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM8 + SolutionIndex: 1165 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 16 - SubGroupA: 32 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -184657,8 +185522,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -184701,14 +185566,14 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 + LSPA: 32 + LSPB: 64 LVCA: 16 LVCB: 8 - LVPA: 4 - LVPB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 12416 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 @@ -184720,14 +185585,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -184739,13 +185604,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -184800,28 +185665,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 + SolutionIndex: 1166 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -184847,7 +185712,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -184866,19 +185731,19 @@ LSCA: 64 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -184893,9 +185758,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -184903,8 +185768,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -184920,7 +185785,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -184964,20 +185829,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR0_SNLL1_TT4_4_VW4_WG16_8_4_WGM16 + SolutionIndex: 1167 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -184985,7 +185850,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [8, 16, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185011,7 +185876,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -185027,38 +185892,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 32 LSPA: 32 LSPB: 64 - LVCA: 16 - LVCB: 8 + LVCA: 32 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedA: 4096 LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185073,7 +185938,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 1024 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185128,28 +185993,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_4_WGM16 + SolutionIndex: 1168 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 32 + SubGroup1: 16 + SubGroupA: 32 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 4] + WorkGroup: [32, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -185166,49 +186031,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCA: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -185219,10 +186080,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185231,13 +186092,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185247,8 +186108,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -185292,31 +186153,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_4_WGM16 + SolutionIndex: 1169 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 4] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185330,15 +186191,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -185346,47 +186207,43 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 - LSPA: 32 + LSCA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LVCA: 8 + LVCB: 4 + LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 + LoopUnroll: 2 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185395,13 +186252,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 1024 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -185411,7 +186268,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185456,16 +186313,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_16_2_WGM16 + SolutionIndex: 1170 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 32 + SubGroupA: 4 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -185476,11 +186333,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185494,59 +186351,63 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 8 - LVCB: 4 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185555,12 +186416,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -185571,8 +186432,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -185616,31 +186477,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1171 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + SuppressNoLoadLoop: true + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185679,34 +186540,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 8 + LVCA: 16 LVCB: 4 LVPA: 4 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 2 - MacroTile0: 16 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185715,8 +186576,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -185776,14 +186637,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -185797,8 +186658,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -185814,49 +186675,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -185867,10 +186724,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -185879,12 +186736,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -185895,7 +186752,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -185940,8 +186797,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -185949,22 +186806,22 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -185984,7 +186841,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186013,8 +186870,12 @@ LVPB: 32 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186055,7 +186916,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186100,8 +186961,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186109,7 +186970,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186122,7 +186983,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -186138,7 +186999,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186164,17 +187025,17 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186187,7 +187048,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -186204,7 +187065,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -186260,8 +187121,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186282,9 +187143,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -186298,7 +187159,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -186324,21 +187185,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 32 + LVCB: 8 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186351,7 +187212,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 64 MacroTileA: 32 @@ -186368,7 +187229,7 @@ NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -186424,8 +187285,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186446,9 +187307,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -186487,34 +187348,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -186523,8 +187384,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -186584,14 +187445,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -186605,7 +187466,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -186628,7 +187489,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186657,12 +187518,8 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -186703,7 +187560,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186748,8 +187605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -186757,7 +187614,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186770,7 +187627,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -186792,7 +187649,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186811,34 +187668,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -186847,8 +187708,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -186863,7 +187724,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -186908,16 +187769,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG4_16_4_WGM1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -186929,8 +187790,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -186952,7 +187813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -186981,8 +187842,12 @@ LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -187023,7 +187888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -187068,8 +187933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187077,7 +187942,7 @@ SubGroup1: 16 SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -187090,7 +187955,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187106,7 +187971,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187131,14 +187996,14 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 512 @@ -187152,18 +188017,18 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 8 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -187171,8 +188036,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -187232,15 +188097,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM8 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 4 + SubGroup1: 8 + SubGroupA: 4 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -187253,10 +188118,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] + WorkGroup: [4, 8, 8] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187278,41 +188143,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 - LVPA: 8 + LVCB: 4 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -187324,9 +188189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187335,12 +188200,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187396,8 +188261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_16_2_WGM16 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187406,19 +188271,19 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187434,7 +188299,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187442,56 +188307,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 - LVPB: 8 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 8 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -187499,12 +188364,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187560,31 +188425,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG4_8_8_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 8 - SubGroupA: 4 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 8, 8] + VectorWidth: 4 + WorkGroup: [8, 16, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187724,8 +188589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187746,7 +188611,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -187762,7 +188627,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187787,18 +188652,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -187815,10 +188680,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187827,12 +188692,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -187888,8 +188753,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1180 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM8 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -187898,10 +188763,10 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -187910,9 +188775,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 16, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -187926,7 +188791,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -187951,18 +188816,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 7296 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2176 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -187979,10 +188844,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -187991,12 +188856,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188052,8 +188917,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1181 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1186 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188062,10 +188927,10 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -188076,7 +188941,7 @@ WorkGroup: [8, 16, 2] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188124,30 +188989,30 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -188155,12 +189020,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188216,15 +189081,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1182 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM1 + SolutionIndex: 1187 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -188237,8 +189102,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -188254,7 +189119,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188262,56 +189127,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 2176 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 4 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -188319,12 +189184,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188380,15 +189245,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1183 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_16_2_WGM16 + SolutionIndex: 1188 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 @@ -188400,11 +189265,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 16 + VectorWidth: 2 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188418,49 +189283,45 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -188471,7 +189332,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188484,7 +189345,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -188499,7 +189360,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -188544,8 +189405,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1184 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB0_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1189 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188553,7 +189414,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -188564,11 +189425,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188708,8 +189569,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1185 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1190 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188730,7 +189591,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -188746,13 +189607,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -188772,17 +189633,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -188795,7 +189660,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188811,8 +189676,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -188823,7 +189688,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -188868,8 +189733,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1186 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1191 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -188877,7 +189742,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -188890,9 +189755,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -188906,7 +189771,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -188932,21 +189797,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -188959,7 +189824,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -188975,8 +189840,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189032,8 +189897,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1187 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1192 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189054,9 +189919,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189196,8 +190061,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1188 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM1 + SolutionIndex: 1193 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189218,7 +190083,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -189234,7 +190099,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189242,7 +190107,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -189250,37 +190115,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -189288,9 +190153,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -189299,12 +190164,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189360,14 +190225,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1189 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1194 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -189380,11 +190245,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189398,7 +190263,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189406,7 +190271,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -189414,37 +190279,37 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -189452,9 +190317,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -189463,12 +190328,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -189524,14 +190389,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1190 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1195 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true ThreadTile: [4, 4] @@ -189544,11 +190409,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -189688,8 +190553,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1191 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1196 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189710,7 +190575,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -189735,7 +190600,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -189754,19 +190619,19 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 8192 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -189781,9 +190646,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -189791,8 +190656,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -189852,8 +190717,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1192 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1197 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -189862,11 +190727,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -189874,7 +190739,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -189890,7 +190755,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -189899,7 +190764,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -189916,21 +190781,21 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -189943,7 +190808,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -189959,7 +190824,7 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -190016,8 +190881,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1193 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1198 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190038,9 +190903,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190054,7 +190919,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -190063,7 +190928,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190079,18 +190944,182 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 128 + LSCB: 32 LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPB: 32 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 8192 + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 4 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 512 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1199 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 32 + SubGroup1: 4 + SubGroupA: 32 + SubGroupB: 4 + SuppressNoLoadLoop: true + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [32, 4, 4] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 6272 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 @@ -190100,7 +191129,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -190108,10 +191137,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -190119,8 +191148,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -190180,31 +191209,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1194 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG16_8_2_WGM16 + SolutionIndex: 1200 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -190344,8 +191373,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1195 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM1 + SolutionIndex: 1201 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190366,7 +191395,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190388,10 +191417,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -190407,22 +191436,18 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 16384 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -190436,9 +191461,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -190447,13 +191472,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -190463,7 +191488,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190508,29 +191533,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1196 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_8_VW4_WG32_4_4_WGM1 + SolutionIndex: 1202 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 4 - SubGroupA: 32 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 8] + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190672,8 +191697,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1197 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM1 + SolutionIndex: 1203 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -190694,7 +191719,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -190744,7 +191769,7 @@ LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 + LdsNumElements: 8192 LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 @@ -190756,14 +191781,14 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -190836,20 +191861,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1198 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM8 + SolutionIndex: 1204 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -190857,7 +191882,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] + WorkGroup: [8, 8, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -190880,7 +191905,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 @@ -190899,34 +191924,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -190935,11 +191964,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -190951,7 +191980,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -190996,16 +192025,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1199 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS0_GRVW4_GSU1_LPB4_PGR0_PLR1_SNLL0_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1205 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: false + SuppressNoLoadLoop: true ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191017,8 +192046,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191160,8 +192189,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1200 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1206 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191182,7 +192211,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191204,54 +192233,50 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -191264,11 +192289,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191279,7 +192304,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191324,29 +192349,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1201 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT8_4_VW4_WG8_8_4_WGM8 + SolutionIndex: 1207 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 8 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true - ThreadTile: [8, 4] - ThreadTile0: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191368,43 +192393,39 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 64 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191428,11 +192449,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191443,7 +192464,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191488,8 +192509,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1202 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_8_2_WGM16 + SolutionIndex: 1208 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191497,7 +192518,7 @@ SubGroup1: 8 SubGroupA: 16 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191508,9 +192529,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191534,41 +192555,41 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 4 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191592,11 +192613,11 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191652,8 +192673,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1203 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG8_8_4_WGM16 + SolutionIndex: 1209 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191661,7 +192682,7 @@ SubGroup1: 8 SubGroupA: 8 SubGroupB: 8 - SuppressNoLoadLoop: true + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -191672,9 +192693,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191696,7 +192717,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -191715,34 +192736,38 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 - LSPA: 8 + LSPA: 16 LSPB: 16 - LVCA: 32 + LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -191751,11 +192776,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -191767,7 +192792,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191812,14 +192837,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1204 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM1 + SolutionIndex: 1210 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] @@ -191833,8 +192858,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -191850,7 +192875,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191867,7 +192892,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -191875,20 +192900,20 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 + LSCA: 16 + LSCB: 8 LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 + LSPB: 32 + LVCA: 8 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetB: 128 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191899,10 +192924,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -191911,12 +192936,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -191928,7 +192953,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -191972,31 +192997,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1205 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS0_GRVW2_GSU4_LPB2_PGR0_PLR1_SNLL0_TT4_4_VW2_WG16_8_2_WGM8 + SolutionIndex: 1211 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192010,13 +193035,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -192027,7 +193052,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -192036,27 +193061,23 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 + LSCB: 8 + LSPA: 8 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -192079,8 +193100,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -192091,7 +193112,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192136,31 +193157,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1206 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM8 + SolutionIndex: 1212 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192174,13 +193195,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -192191,7 +193212,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -192200,34 +193221,30 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -192243,8 +193260,8 @@ NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -192255,8 +193272,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -192300,31 +193317,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1207 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_GRVW2_GSU4_LPB2_PGR1_PLR1_SNLL0_TT4_4_VW2_WG8_8_4_WGM16 + SolutionIndex: 1213 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192338,7 +193355,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192363,34 +193380,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 - LSCB: 8 - LSPA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 32 - LVCA: 8 - LVCB: 4 - LVPA: 4 + LVCA: 16 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 128 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 32 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192399,8 +193416,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192416,7 +193433,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -192460,14 +193477,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1208 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1214 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192481,10 +193498,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -192532,11 +193549,11 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 528 LdsOffsetA: 0 LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192620,8 +193637,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1209 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1215 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192683,34 +193700,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192719,8 +193736,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192780,14 +193797,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1210 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR0_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1216 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192801,8 +193818,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -192843,34 +193860,34 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -192879,8 +193896,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -192940,14 +193957,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1211 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS0_GRVW2_GSU1_LPB0_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1217 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [2, 2] @@ -192961,8 +193978,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -192978,13 +193995,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193004,23 +194021,27 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 + LSCB: 16 + LSPA: 16 LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 528 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193055,7 +194076,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193100,20 +194121,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1212 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG16_16_1_WGM16 + SolutionIndex: 1218 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -193121,10 +194142,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 16 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193163,35 +194184,35 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 16 LSPA: 16 LSPB: 32 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1568 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193199,12 +194220,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -193260,29 +194281,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1213 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1219 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -193298,7 +194319,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193318,29 +194339,29 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 32 + LSCB: 32 + LSPA: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 512 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193349,9 +194370,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193359,7 +194380,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -193420,15 +194441,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1214 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1220 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -193441,10 +194462,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193458,7 +194479,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193478,21 +194499,21 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3104 + LdsNumElements: 3136 LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -193504,7 +194525,7 @@ LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -193512,10 +194533,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193523,8 +194544,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -193584,20 +194605,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1215 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_16_2_WGM1 + SolutionIndex: 1221 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 8 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 8 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -193605,10 +194626,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [8, 8, 4] + WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193622,13 +194643,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193648,34 +194669,38 @@ InnerUnroll: 1 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1568 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193683,12 +194708,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -193699,8 +194724,8 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -193744,31 +194769,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1216 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR0_SNLL0_TT2_4_VW2_WG16_16_1_WGM16 + SolutionIndex: 1222 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 - ThreadTile1: 4 - ThreadTileA: 2 - ThreadTileB: 4 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: true + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [8, 8, 4] WorkGroupMapping: 16 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -193788,7 +194813,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 @@ -193807,18 +194832,22 @@ GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 + LSPA: 16 + LSPB: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 - LVPB: 8 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -193832,10 +194861,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -193847,7 +194876,7 @@ NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -193859,7 +194888,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -193904,16 +194933,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1217 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS0_GRVW2_GSU1_LPB2_PGR0_PLR1_SNLL0_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1223 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 + SuppressNoLoadLoop: true ThreadTile: [2, 2] ThreadTile0: 2 ThreadTile1: 2 @@ -193925,7 +194954,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -193962,27 +194991,27 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 + LSPA: 16 LSPB: 16 - LVCA: 8 + LVCA: 16 LVCB: 16 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3136 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -193996,9 +195025,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 16 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194007,11 +195036,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -194068,28 +195097,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1218 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1224 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 8, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -194114,7 +195143,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194122,33 +195151,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -194160,9 +195189,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194171,7 +195200,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -194232,29 +195261,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1219 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT4_2_VW2_WG8_8_4_WGM16 + SolutionIndex: 1225 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 8, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194278,7 +195307,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194286,48 +195315,48 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 128 LSCB: 32 LSPA: 16 - LSPB: 8 - LVCA: 16 + LSPB: 32 + LVCA: 32 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 13440 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -194335,13 +195364,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 512 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -194396,29 +195425,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1220 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_2_VW2_WG16_4_4_WGM16 + SolutionIndex: 1226 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: true - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + VectorWidth: 4 + WorkGroup: [32, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194442,7 +195471,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -194450,33 +195479,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -194488,9 +195517,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -194499,7 +195528,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -194560,8 +195589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1221 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_GRVW2_GSU1_LPB2_PGR1_PLR1_SNLL1_TT2_4_VW2_WG16_4_4_WGM16 + SolutionIndex: 1227 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194570,19 +195599,19 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: true - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194724,8 +195753,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1222 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM1 + SolutionIndex: 1228 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194746,7 +195775,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -194888,8 +195917,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1223 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM1 + SolutionIndex: 1229 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -194910,171 +195939,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [32, 8, 2] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 4 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 4 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MinGlobalWriteVectorWidth: 1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1224 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM8 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 4 - SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 16 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -195082,7 +195947,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -195090,16 +195955,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195114,40 +195979,41 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195155,11 +196021,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195171,6 +196039,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -195178,6 +196047,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -195216,16 +196086,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1225 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG16_4_4_WGM16 + SolutionIndex: 1230 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 - SuppressNoLoadLoop: true + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -195237,16 +196107,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -195254,7 +196122,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195263,7 +196131,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -195278,29 +196146,30 @@ GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 + InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 32 - LVCA: 32 - LVCB: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 13440 - LdsNumElementsAlignedA: 4096 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4096 - LdsOffsetB_Blk: 12288 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -195308,10 +196177,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -195319,13 +196188,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 512 + NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -195335,6 +196204,7 @@ PerformanceWaitCount: -1 PerformanceWaitLocation: -1 PersistentKernel: 0 + PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: @@ -195342,6 +196212,7 @@ Batched: true ComplexConjugateA: false ComplexConjugateB: false + ComputeDataType: 0 DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -195380,16 +196251,16 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1226 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x32_SE_EPS1_GRVW4_GSU1_LPB4_PGR1_PLR1_SNLL1_TT4_4_VW4_WG32_8_2_WGM16 + SolutionIndex: 1231 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 - SuppressNoLoadLoop: true + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 ThreadTile1: 4 @@ -195401,10 +196272,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 2] - WorkGroupMapping: 16 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -195418,48 +196289,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -195472,10 +196343,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -195484,13 +196355,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195504,7 +196373,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -195549,8 +196418,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1227 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1232 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -195559,21 +196428,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -195585,7 +196456,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -195593,40 +196464,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -195639,10 +196510,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -195651,11 +196522,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -195669,7 +196540,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -195714,8 +196585,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1228 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1233 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -195724,22 +196595,22 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -195836,7 +196707,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -195881,8 +196752,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1229 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1234 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196003,7 +196874,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -196048,8 +196919,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1230 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1235 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196086,14 +196957,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -196112,22 +196983,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1536 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196140,11 +197011,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196154,10 +197025,12 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 2 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 3 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196215,8 +197088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1231 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1236 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196225,11 +197098,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196239,9 +197112,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196262,14 +197133,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -196279,22 +197150,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196308,10 +197179,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196323,7 +197194,7 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -196382,8 +197253,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1232 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1237 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196392,11 +197263,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196407,7 +197278,7 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -196420,7 +197291,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -196428,40 +197299,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 LSPB: 32 - LVCA: 32 + LVCA: 128 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1536 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196474,11 +197345,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + LoopUnroll: 8 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196486,14 +197357,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 2 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196551,8 +197422,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1233 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1238 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196561,21 +197432,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196594,16 +197465,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -196613,22 +197484,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -196642,10 +197513,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -196653,11 +197524,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -196671,7 +197544,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -196716,8 +197589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1234 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1239 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196726,11 +197599,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -196741,8 +197614,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -196762,30 +197633,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -196822,12 +197693,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -196885,8 +197756,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1235 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1240 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -196901,7 +197772,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -196929,7 +197800,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -196937,11 +197808,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -196949,9 +197820,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -196988,12 +197859,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197052,8 +197923,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1236 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1241 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197072,7 +197943,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -197096,7 +197967,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197104,11 +197975,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197116,9 +197987,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197155,12 +198026,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197219,8 +198090,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1237 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1242 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197239,9 +198110,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -197341,7 +198212,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -197386,8 +198257,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1238 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1243 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197422,7 +198293,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -197431,7 +198302,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -197449,12 +198320,179 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1244 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -197489,12 +198527,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197508,7 +198544,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197553,8 +198589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1239 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1245 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197573,11 +198609,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197596,8 +198634,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197605,11 +198643,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197617,9 +198655,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197656,12 +198694,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197720,8 +198756,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1240 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1246 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197740,11 +198776,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197756,16 +198794,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -197783,21 +198821,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -197810,7 +198848,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -197824,11 +198862,9 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -197842,7 +198878,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -197887,8 +198923,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1241 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1247 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -197911,7 +198947,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -197931,7 +198969,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -197939,11 +198977,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -197951,9 +198989,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -197990,10 +199028,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198052,8 +199090,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1242 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1248 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198072,7 +199110,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -198098,7 +199136,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -198106,11 +199144,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -198118,9 +199156,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -198157,10 +199195,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198219,8 +199257,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1243 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1249 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198239,9 +199277,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -198341,7 +199379,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -198386,8 +199424,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1244 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1250 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198408,7 +199446,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -198424,7 +199462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -198433,7 +199471,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -198451,21 +199489,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -198478,7 +199516,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -198494,7 +199532,7 @@ NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -198508,7 +199546,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -198553,8 +199591,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1245 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1251 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198577,7 +199615,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -198598,37 +199636,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -198647,9 +199685,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198657,12 +199695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -198720,8 +199760,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1246 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1252 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198730,23 +199770,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -198765,8 +199803,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -198774,11 +199812,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -198786,16 +199824,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -198814,9 +199852,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198824,12 +199862,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -198842,7 +199882,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -198887,8 +199927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1247 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1253 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -198897,23 +199937,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -198925,48 +199963,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 8 + LSCB: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -198979,11 +200017,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -198991,12 +200029,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199054,8 +200094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1248 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1254 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199064,23 +200104,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -199100,30 +200138,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199160,12 +200198,12 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199178,7 +200216,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -199223,8 +200261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1249 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_WG16_16_1_WGM8 + SolutionIndex: 1255 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199239,13 +200277,13 @@ ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199267,19 +200305,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -199287,10 +200325,10 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199326,13 +200364,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199390,8 +200428,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1250 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1256 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199410,7 +200448,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -199434,19 +200472,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -199454,10 +200492,10 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -199493,13 +200531,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -199557,8 +200595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1251 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1257 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199577,9 +200615,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199679,7 +200717,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -199724,8 +200762,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1252 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1258 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199746,7 +200784,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -199760,7 +200798,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -199787,6 +200825,173 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 + LdcEqualsLdd: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MinGlobalWriteVectorWidth: 1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0Indices: [I] + PackedC1Indices: [J] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SilentHighPrecisionAccumulate: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStrides: false + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1259 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: false + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 8 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 128 @@ -199828,8 +201033,6 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -199891,8 +201094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1253 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1260 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -199916,6 +201119,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -199934,7 +201139,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -199995,8 +201200,6 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -200058,8 +201261,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1254 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1261 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -200080,9 +201283,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200100,42 +201305,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 32 LVCA: 32 - LVCB: 2 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -200149,10 +201350,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200160,8 +201361,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -200179,8 +201380,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -200225,8 +201426,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1255 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1262 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -200235,19 +201436,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -200261,7 +201462,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200269,46 +201470,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -200316,10 +201517,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200327,8 +201528,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 @@ -200392,31 +201593,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1256 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM1 + SolutionIndex: 1263 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [8, 16, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200428,7 +201629,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200436,57 +201637,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200494,12 +201695,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -200512,7 +201713,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -200557,31 +201758,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1257 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1264 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 8 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [8, 16, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -200595,7 +201796,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200603,57 +201804,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -200661,11 +201862,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -200724,31 +201925,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1258 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_WG16_16_1_WGM8 + SolutionIndex: 1265 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -200762,57 +201963,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 LVPA: 8 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 6208 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -200826,12 +202031,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -200843,8 +202048,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -200889,31 +202094,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1259 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_WG16_16_1_WGM1 + SolutionIndex: 1266 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -200925,7 +202130,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -200941,7 +202146,7 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -200952,21 +202157,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 + LVCB: 8 LVPA: 8 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -200979,7 +202184,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -200993,12 +202198,12 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -201056,31 +202261,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1260 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1267 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201092,15 +202297,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -201108,34 +202313,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201146,10 +202351,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201158,12 +202363,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -201221,33 +202428,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1261 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG8_16_2_WGM1 + SolutionIndex: 1268 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201266,43 +202471,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 - LSPB: 16 + LSPB: 32 LVCA: 16 - LVCB: 16 - LVPA: 8 + LVCB: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201314,10 +202519,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -201325,8 +202530,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -201388,8 +202595,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1262 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1269 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201398,23 +202605,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201433,7 +202638,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -201452,22 +202657,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 - LSPA: 16 + LSPA: 8 LSPB: 16 - LVCA: 16 + LVCA: 32 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6208 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -201481,9 +202686,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201492,13 +202697,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -201557,8 +202760,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1263 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM1 + SolutionIndex: 1270 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201567,10 +202770,10 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -201579,9 +202782,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201600,8 +202805,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -201613,30 +202818,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -201648,9 +202853,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201659,10 +202864,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -201724,8 +202927,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1264 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1271 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201734,21 +202937,23 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201760,16 +202965,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -201787,21 +202992,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 7296 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1152 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -201814,7 +203019,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 32 MacroTileA: 64 @@ -201828,11 +203033,9 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -201891,8 +203094,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1265 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1272 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -201915,7 +203118,9 @@ WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -201935,7 +203140,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -201947,44 +203152,44 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 8 LSCB: 32 - LSPA: 16 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 + LVPA: 32 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -201993,13 +203198,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -202058,29 +203263,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1266 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1273 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -202101,57 +203306,57 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 4 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202160,12 +203365,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202223,33 +203430,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1267 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1274 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202261,54 +203466,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 32 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -202316,9 +203521,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202327,12 +203532,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202390,33 +203597,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1268 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1275 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202436,56 +203641,56 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 7296 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1152 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202494,12 +203699,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202557,28 +203762,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1269 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_8_2_WGM8 + SolutionIndex: 1276 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -202602,8 +203807,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -202615,28 +203820,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202650,9 +203855,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -202661,10 +203866,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -202726,8 +203929,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1270 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1277 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -202736,21 +203939,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -202770,40 +203975,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 - LVCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 6720 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB_Blk: 4608 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202818,9 +204023,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -202828,14 +204033,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -202848,7 +204053,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -202893,8 +204098,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1271 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM1 + SolutionIndex: 1278 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -202903,19 +204108,19 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -202937,7 +204142,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -202945,32 +204150,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 8 LVCA: 16 LVCB: 32 - LVPA: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -202984,10 +204189,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -202995,14 +204200,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203060,28 +204265,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1272 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_WG4_16_4_WGM8 + SolutionIndex: 1279 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -203103,41 +204308,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203151,10 +204356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203164,9 +204369,11 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 @@ -203180,7 +204387,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -203225,33 +204432,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1273 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1280 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203270,41 +204475,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203318,10 +204523,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203331,10 +204536,12 @@ NonTemporalC: 0 NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203392,33 +204599,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1274 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1281 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203439,14 +204644,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -203456,22 +204661,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6720 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 2112 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 512 - LdsOffsetB_Blk: 4608 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -203485,10 +204690,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203496,14 +204701,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 PackFreeDims: 1 @@ -203561,20 +204766,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1275 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR0_TT4_4_USFGRO0_WG4_16_4_WGM8 + SolutionIndex: 1282 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -203582,7 +204787,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 @@ -203597,7 +204802,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -203605,7 +204810,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -203613,26 +204818,26 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -203640,7 +204845,7 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -203651,11 +204856,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203663,13 +204868,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 PackBatchDims: 0 @@ -203728,8 +204933,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1276 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1283 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -203738,21 +204943,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203764,13 +204969,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -203790,28 +204995,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -203819,10 +205020,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203830,15 +205031,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -203849,8 +205051,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -203895,20 +205097,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1277 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1284 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -203916,10 +205118,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -203931,13 +205133,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -203951,34 +205153,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -203986,10 +205184,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -203997,15 +205195,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 2 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204016,8 +205215,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -204062,31 +205261,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1278 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_WG16_4_4_WGM8 + SolutionIndex: 1285 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204098,7 +205297,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -204106,57 +205305,57 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -204164,15 +205363,16 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204184,7 +205384,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -204229,31 +205429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1279 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR0_TT2_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1286 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204274,7 +205474,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -204294,36 +205494,36 @@ LSCA: 64 LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -204331,8 +205531,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -204340,6 +205540,7 @@ NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 + OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 @@ -204396,15 +205597,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1280 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_WG16_4_4_WGM8 + SolutionIndex: 1287 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -204417,7 +205618,7 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -204438,38 +205639,42 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204496,12 +205701,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204514,7 +205717,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -204560,8 +205763,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1281 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1288 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204576,7 +205779,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -204585,6 +205788,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204596,44 +205801,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204646,7 +205855,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -204660,12 +205869,10 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204678,8 +205885,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -204724,8 +205931,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1282 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1289 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204740,7 +205947,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -204748,7 +205955,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -204768,40 +205977,40 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 LSPA: 16 - LSPB: 64 + LSPB: 32 LVCA: 16 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204815,9 +206024,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -204826,14 +206035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 + NumLoadsB: 2 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -204892,8 +206101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1283 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1290 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -204902,19 +206111,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -204928,48 +206137,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 LSPB: 64 - LVCA: 16 + LVCA: 48 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -204982,10 +206191,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 96 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 96 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -204994,13 +206203,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205060,8 +206267,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1284 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1291 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205070,21 +206277,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205103,7 +206312,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -205122,22 +206331,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205151,9 +206360,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205162,11 +206371,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205226,8 +206437,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1285 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1292 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205236,10 +206447,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -205248,11 +206459,9 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205264,16 +206473,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -205290,18 +206499,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -205318,10 +206527,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205330,8 +206539,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -205394,8 +206605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1286 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1293 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205404,10 +206615,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -205418,9 +206629,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205432,7 +206641,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -205440,7 +206649,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -205448,32 +206657,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205486,10 +206695,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205498,14 +206707,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -205564,8 +206773,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1287 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1294 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205574,21 +206783,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205600,48 +206809,48 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 + LSCA: 128 + LSCB: 16 + LSPA: 8 LSPB: 64 - LVCA: 48 + LVCA: 32 LVCB: 4 - LVPA: 3 - LVPB: 32 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -205654,10 +206863,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 96 + LoopUnroll: 16 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 96 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -205666,8 +206875,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -205730,8 +206941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1288 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_DTL0_EPS1_FL1_GRVW2_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1295 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205740,23 +206951,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205775,8 +206984,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -205784,11 +206993,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -205796,9 +207005,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -205835,12 +207044,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -205900,8 +207107,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1289 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW2_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1296 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -205920,11 +207127,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -205943,7 +207152,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -206004,8 +207213,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -206068,8 +207275,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1290 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1297 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206090,9 +207297,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206110,42 +207319,38 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 2 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -206159,10 +207364,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206170,8 +207375,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -206190,8 +207395,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -206236,8 +207441,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1291 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1298 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206246,17 +207451,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -206272,50 +207477,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 16 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 784 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206326,11 +207527,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206338,8 +207539,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -206358,8 +207559,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -206404,8 +207605,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1292 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_DTL0_EPS1_FL0_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1299 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206414,21 +207615,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 2] + ThreadTile0: 4 + ThreadTile1: 2 + ThreadTileA: 4 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206440,7 +207641,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206448,8 +207649,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -206460,30 +207661,30 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 + LSCA: 32 + LSCB: 16 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 - LVPB: 32 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206494,10 +207695,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 32 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 32 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -206506,8 +207707,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -206570,8 +207771,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1293 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1300 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206580,21 +207781,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -206608,50 +207809,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 2080 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206662,10 +207859,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -206674,12 +207871,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -206692,7 +207891,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -206738,8 +207937,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1294 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_DTL0_EPS1_FL1_GRVW4_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1301 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206748,23 +207947,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206776,7 +207973,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206796,26 +207993,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206826,11 +208023,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -206838,14 +208035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -206859,7 +208056,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -206904,8 +208101,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1295 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW2_WG16_16_1_WGM8 + SolutionIndex: 1302 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -206914,21 +208111,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -206940,7 +208137,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -206948,38 +208145,38 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 + LSCB: 32 + LSPA: 16 LSPB: 32 - LVCA: 64 + LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 784 + LdsNumElements: 4224 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -206990,11 +208187,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 32 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207002,14 +208199,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207023,7 +208220,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -207068,8 +208265,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1296 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR0_TT4_2_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1303 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -207078,21 +208275,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207110,9 +208307,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -207124,30 +208321,26 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 16 LSPA: 16 LSPB: 64 LVCA: 16 LVCB: 4 - LVPA: 8 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -207159,9 +208352,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -207170,7 +208363,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 + NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -207188,7 +208381,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207234,8 +208427,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1297 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x16_SE_DTL0_EPS1_FL1_GRVW4_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1304 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -207244,19 +208437,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 fractionalPerpOverhangA: 0 @@ -207298,35 +208491,35 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 16 - LSPA: 4 + LSPA: 8 LSPB: 16 - LVCA: 64 + LVCA: 32 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2080 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207334,14 +208527,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207400,29 +208593,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1298 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 + SolutionIndex: 1305 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -207442,55 +208635,59 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207498,14 +208695,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207518,7 +208715,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207564,15 +208761,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1299 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL0_GRVW1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1306 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -207580,12 +208777,12 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 @@ -207600,16 +208797,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -207627,34 +208824,38 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4224 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207662,14 +208863,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -207682,7 +208881,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207728,15 +208927,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1300 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_DTL0_EPS0_FL0_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1307 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -207749,10 +208948,12 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 2] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207764,15 +208965,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -207784,41 +208985,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 - LVPA: 4 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207826,8 +209031,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -207844,7 +209051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -207890,33 +209097,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1301 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_DTL0_EPS0_FL1_GRVW4_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1308 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -207928,13 +209133,13 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -207948,30 +209153,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 32 - LSCB: 16 + LSCB: 32 LSPA: 8 - LSPB: 16 + LSPB: 8 LVCA: 32 - LVCB: 16 + LVCB: 32 LVPA: 8 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -207980,9 +209189,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 16 + MacroTile1: 8 MacroTileA: 32 - MacroTileB: 16 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -207990,13 +209199,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 4 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208010,7 +209219,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -208056,15 +209265,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1302 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x16_SE_DTL0_EPS0_FL0_GRVW1_LPB0_PGR0_PLR1_TT2_2_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1309 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [2, 2] ThreadTile0: 2 @@ -208077,10 +209286,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208092,15 +209301,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -208108,26 +209317,26 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 576 LdsOffsetA: 0 @@ -208135,11 +209344,11 @@ LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208147,10 +209356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 16 + MacroTileA: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208158,13 +209367,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208224,31 +209431,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1303 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1310 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208260,14 +209469,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -208287,27 +209496,27 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 16 LVCA: 16 - LVCB: 8 + LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208316,9 +209525,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208326,11 +209535,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208390,15 +209601,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1304 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_DTL0_EPS1_FL1_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM8 + SolutionIndex: 1311 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 4 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -208411,16 +209622,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -208428,23 +209637,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -208454,28 +209663,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 - LSCB: 32 - LSPA: 32 + LSCA: 64 + LSCB: 8 + LSPA: 4 LSPB: 32 - LVCA: 8 + LVCA: 64 LVCB: 8 - LVPA: 32 - LVPB: 8 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208483,10 +209688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 - MacroTile1: 32 - MacroTileA: 8 - MacroTileB: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208494,14 +209699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -208514,8 +209719,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -208560,31 +209765,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1305 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_DTL0_EPS1_FL0_GRVW4_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1312 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208596,54 +209801,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 8 LSPA: 8 - LSPB: 8 + LSPB: 64 LVCA: 32 - LVCB: 32 - LVPA: 8 - LVPB: 8 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208651,10 +209856,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208662,13 +209867,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 4 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -208728,35 +209931,37 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1306 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_DTL0_EPS1_FL0_GRVW1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 + SolutionIndex: 1313 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -208764,54 +209969,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208819,10 +210024,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208830,12 +210035,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -208894,33 +210101,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1307 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_DTL0_EPS1_FL1_GRVW2_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1314 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -208932,7 +210137,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -208940,7 +210145,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -208948,38 +210153,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 4 + LSPB: 64 + LVCA: 64 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -208987,10 +210192,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -208998,8 +210203,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -209064,31 +210269,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1308 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_DTL0_EPS1_FL0_GRVW4_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1315 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 4, 4] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209126,18 +210331,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1024 + LdsNumElements: 512 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -209151,10 +210356,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209162,14 +210367,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209228,8 +210433,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1309 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1316 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209238,11 +210443,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209256,7 +210461,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209271,43 +210476,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 8 LSPA: 8 - LSPB: 64 + LSPB: 32 LVCA: 32 - LVCB: 4 - LVPA: 4 + LVCB: 8 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1544 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209319,10 +210524,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209330,8 +210535,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -209349,7 +210556,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209394,8 +210601,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1310 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1317 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209404,23 +210611,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209438,7 +210643,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -209458,24 +210663,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 32 LSCB: 8 - LSPA: 2 + LSPA: 8 LSPB: 32 - LVCA: 128 + LVCA: 32 LVCB: 8 - LVPA: 2 + LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 520 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209487,10 +210688,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209498,14 +210699,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 4 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209518,8 +210719,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209564,8 +210765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1311 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1318 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209574,11 +210775,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209592,7 +210793,7 @@ - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209600,50 +210801,46 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 32 + LVCB: 16 + LVPA: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1040 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -209654,11 +210851,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209666,14 +210863,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209686,8 +210883,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchGlobalRead: false + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -209732,8 +210929,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1312 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1319 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209742,21 +210939,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -209774,7 +210971,7 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 @@ -209803,9 +211000,13 @@ LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 512 + LdsNumElements: 1792 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 256 + LdsOffsetB_Blk: 1280 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -209820,9 +211021,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -209830,14 +211031,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -209850,7 +211051,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -209896,8 +211097,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1313 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB0_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1320 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -209906,11 +211107,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -209918,13 +211119,13 @@ VectorStore: true VectorWidth: 1 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -209952,7 +211153,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 @@ -209967,7 +211168,7 @@ LVPA: 8 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1544 + LdsNumElements: 2048 LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -209975,18 +211176,18 @@ LdsOffsetB: 256 LdsOffsetB_Blk: 1280 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 32 MacroTile1: 32 MacroTileA: 32 @@ -209999,7 +211200,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -210019,7 +211220,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210064,29 +211265,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1314 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB1_PGR1_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1321 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -210100,7 +211301,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -210117,7 +211318,7 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 @@ -210126,34 +211327,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 64 + LVCB: 32 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 520 + LdsNumElements: 4096 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetB: 2048 LdsPadA: 0 LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 16 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -210162,14 +211363,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210183,7 +211384,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210228,31 +211429,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1315 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM8 + SolutionIndex: 1322 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -210273,14 +211474,14 @@ ExpandPointerSwap: false FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -210290,34 +211491,34 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 32 - LVCB: 16 - LVPA: 8 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 16 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1040 + LdsNumElements: 2048 LdsOffsetA: 0 - LdsOffsetB: 512 + LdsOffsetB: 256 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -210326,14 +211527,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210347,7 +211548,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210392,20 +211593,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1316 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x16_SE_AMAS1_EPS0_FL0_GRVW1_GSU1_LPB1_PGR0_PLR0_TT2_2_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1323 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -210413,14 +211614,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 1 - WorkGroup: [16, 16, 1] + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210428,54 +211629,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 1792 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -210483,10 +211684,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -210494,14 +211695,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210515,7 +211714,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -210560,31 +211759,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1317 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x64x8_SE_AMAS1_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR0_TT2_4_USFGRO1_VW1_WG16_16_1_WGM1 + SolutionIndex: 1324 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -210596,7 +211797,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -210604,66 +211805,68 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 32 - LVCB: 8 - LVPA: 8 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 320 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 256 - LdsOffsetB_Blk: 1280 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -210712,6 +211915,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -210728,35 +211932,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1318 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SE_AMAS3_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_2_WGM8 + SolutionIndex: 1325 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210764,50 +211968,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 8 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 64 - LVCB: 32 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -210816,24 +212024,26 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -210846,7 +212056,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -210876,6 +212086,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -210892,15 +212103,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1319 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x32_SE_AMAS1_EPS0_FL0_GRVW1_GSU8_LPB1_PGR0_PLR1_TT4_4_USFGRO1_VW1_WG16_8_2_WGM1 + SolutionIndex: 1326 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -210908,19 +212119,19 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -210928,76 +212139,82 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 64 + LSCB: 32 LSPA: 16 LSPB: 32 LVCA: 16 LVCB: 8 - LVPA: 16 - LVPB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 2048 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 256 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 64 + MacroTileA: 64 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -211010,7 +212227,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -211040,6 +212257,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -211056,31 +212274,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1320 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x16_SE_AMAS1_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT4_2_USFGRO1_VW1_WG4_16_4_WGM1 + SolutionIndex: 1327 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 4 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211092,7 +212310,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -211101,7 +212319,7 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -211119,50 +212337,52 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 16 + LSPB: 64 LVCA: 16 - LVCB: 16 + LVCB: 4 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 + MaxVgprNumber: 256 MinGlobalWriteVectorWidth: 1 + MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -211206,6 +212426,7 @@ NumIndicesLD: 4 NumIndicesSummation: 1 OperationType: GEMM + SetConstStrideA: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -211222,15 +212443,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1321 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_AMAS3_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1328 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -211243,10 +212464,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -211267,7 +212488,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -211330,8 +212551,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211395,8 +212614,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1322 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1329 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211417,9 +212636,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211431,15 +212652,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211451,28 +212672,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211485,11 +212706,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211499,10 +212720,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211520,7 +212739,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -211566,8 +212785,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1323 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1330 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211576,21 +212795,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211602,15 +212823,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211622,28 +212843,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 8 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 8192 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 2048 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211656,11 +212877,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211670,14 +212891,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -211737,8 +212956,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1324 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1331 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211747,21 +212966,23 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -211773,7 +212994,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -211781,7 +213002,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -211793,28 +213014,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 LVPA: 4 - LVPB: 16 + LVPB: 24 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -211827,11 +213048,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 96 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -211841,8 +213062,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -211906,8 +213127,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1325 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1332 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -211916,21 +213137,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 6] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 6 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -211944,16 +213165,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -211970,18 +213191,18 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 128 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 - LVPA: 4 - LVPB: 16 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -211998,10 +213219,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 64 + LoopUnroll: 8 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -212012,8 +213233,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212031,7 +213254,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -212077,8 +213300,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1326 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1333 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212087,10 +213310,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -212099,11 +213322,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212122,9 +213343,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212135,28 +213356,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212170,10 +213391,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212183,8 +213404,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212248,8 +213471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1327 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1334 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212258,23 +213481,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212293,9 +213514,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212306,28 +213527,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212341,10 +213562,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212354,8 +213575,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212419,8 +213642,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1328 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1335 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212429,23 +213652,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212464,9 +213685,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212477,28 +213698,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 96 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212512,10 +213733,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -212525,8 +213746,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -212590,8 +213813,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1329 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1336 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212600,23 +213823,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212628,7 +213849,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -212637,7 +213858,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -212655,21 +213876,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -212682,7 +213903,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -212698,11 +213919,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -212717,7 +213938,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -212763,8 +213984,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1330 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1337 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212787,7 +214008,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212806,8 +214027,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -212815,11 +214036,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -212827,9 +214048,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -212868,12 +214089,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -212888,7 +214107,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -212934,8 +214153,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1331 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1338 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -212954,11 +214173,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -212977,7 +214198,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -213040,8 +214261,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213059,7 +214278,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -213105,8 +214324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1332 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1339 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213130,6 +214349,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213148,7 +214369,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -213211,8 +214432,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213230,7 +214449,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -213276,8 +214495,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1333 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1340 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213301,6 +214520,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213312,16 +214533,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -213339,183 +214560,12 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 - LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1024 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 - MinVgprNumber: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 - NumThreads: 256 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 0 - DestDataType: 0 - HighPrecisionAccumulate: false - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SilentHighPrecisionAccumulate: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: false - UseBeta: true - UseInitialStrides: false - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1334 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 - UnrollMemFence: false - UseSgprForGRO: false - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 - WorkGroupMappingType: B - _staggerStrideShift: 2 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 - LVCB: 4 - LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 @@ -213552,10 +214602,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -213616,8 +214666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1335 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1341 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213636,9 +214686,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -213741,7 +214791,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -213787,8 +214837,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1336 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1342 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -213809,7 +214859,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -213825,16 +214875,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -213851,39 +214901,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -213893,8 +214943,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -213912,7 +214964,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -213958,20 +215010,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1337 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1343 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -213979,12 +215031,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -213996,14 +215046,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -214022,28 +215072,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214051,10 +215101,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214064,11 +215114,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -214129,20 +215181,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1338 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1344 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -214150,12 +215202,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214167,7 +215217,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214193,28 +215243,28 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 64 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214222,10 +215272,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214235,11 +215285,11 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -214300,20 +215350,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1339 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1345 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 4 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -214321,10 +215371,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 @@ -214338,7 +215388,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214346,7 +215396,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -214354,49 +215404,49 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214406,14 +215456,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214463,6 +215513,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214473,15 +215524,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1340 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1346 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214489,15 +215540,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214509,7 +215560,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214517,46 +215568,46 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 8 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3104 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -214565,9 +215616,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214577,14 +215628,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214634,6 +215685,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214644,15 +215696,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1341 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1347 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214660,15 +215712,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214680,65 +215732,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 32 - LSPA: 16 + LSCB: 16 + LSPA: 4 LSPB: 16 - LVCA: 16 + LVCA: 64 LVCB: 16 LVPA: 4 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 2112 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 16 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 16 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -214748,12 +215796,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214766,7 +215816,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -214803,6 +215853,7 @@ Tensor0: 0 Tensor1: 1 TileA: 0 + TileAwareSelection: false TileB: 1 TotalIndices: 4 TransposeA: false @@ -214813,15 +215864,15 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1342 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1348 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false ThreadTile: [4, 4] ThreadTile0: 4 @@ -214829,17 +215880,15 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] + WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -214851,7 +215900,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -214859,42 +215908,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -214905,7 +215954,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -214921,12 +215970,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -214987,8 +216036,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1343 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1349 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215003,7 +216052,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -215011,7 +216060,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215023,7 +216072,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -215031,42 +216080,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3104 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215077,7 +216126,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -215093,12 +216142,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -215159,8 +216208,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1344 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1350 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215175,7 +216224,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -215183,7 +216232,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215203,30 +216252,30 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 64 - LVCB: 16 + LSPA: 16 + LSPB: 64 + LVCA: 16 + LVCB: 4 LVPA: 4 LVPB: 16 LdcEqualsLdd: false @@ -215261,12 +216310,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -215327,8 +216376,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1345 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1351 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215343,13 +216392,13 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -215398,15 +216447,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 + LdsNumElements: 6208 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 + LdsOffsetA_Blk: 4096 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB_Blk: 5120 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215499,8 +216548,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1346 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1352 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215521,7 +216570,7 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -215535,50 +216584,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCB: 8 + LSPA: 8 LSPB: 64 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3088 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215589,7 +216638,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 64 MacroTileA: 64 @@ -215604,9 +216653,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -215671,8 +216718,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1347 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1353 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215691,11 +216738,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215713,8 +216762,8 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -215742,11 +216791,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2112 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215773,8 +216826,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -215791,7 +216842,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -215839,8 +216890,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1348 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1354 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -215864,6 +216915,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -215882,7 +216935,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -215910,15 +216963,15 @@ LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6208 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 + LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 5120 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -215945,8 +216998,6 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -216011,8 +217062,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1349 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1355 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216036,6 +217087,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216054,43 +217107,43 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3088 + LdsNumElements: 3616 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216103,9 +217156,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216115,12 +217168,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216134,7 +217189,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -216181,8 +217236,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1350 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1356 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216191,23 +217246,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216219,50 +217272,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216273,11 +217326,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216287,12 +217340,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216306,7 +217361,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -216353,8 +217408,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1351 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1357 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216363,23 +217418,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216397,44 +217450,40 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 64 + LVCB: 16 LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3136 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216447,9 +217496,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216459,12 +217508,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216477,7 +217528,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -216525,8 +217576,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1352 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1358 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216535,13 +217586,13 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -216550,8 +217601,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -216571,42 +217620,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 4 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 512 LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216632,13 +217681,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216652,7 +217701,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -216699,8 +217748,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1353 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1359 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216715,11 +217764,11 @@ ThreadTileA: 4 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -216743,42 +217792,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 4 - LSPB: 32 + LSPB: 64 LVCA: 64 - LVCB: 8 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216790,10 +217839,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216804,13 +217853,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -216824,7 +217873,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -216871,8 +217920,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1354 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB4_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1360 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -216881,19 +217930,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -216915,38 +217964,38 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 16 LSPA: 4 - LSPB: 16 + LSPB: 32 LVCA: 64 - LVCB: 16 - LVPA: 4 + LVCB: 8 + LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3136 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -216958,10 +218007,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -216972,13 +218021,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -217039,8 +218088,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1355 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x16_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1361 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217049,19 +218098,19 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 - AggressivePerfMode: 1 @@ -217083,8 +218132,8 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -217095,28 +218144,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 - LSPB: 128 + LSPB: 64 LVCA: 32 - LVCB: 2 - LVPA: 4 + LVCB: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false LdsNumElements: 3584 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1024 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -217130,10 +218179,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -217144,7 +218193,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 @@ -217164,7 +218213,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -217211,8 +218260,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1356 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1362 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217221,17 +218270,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -217255,7 +218304,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -217263,11 +218312,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -217275,9 +218324,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -217316,12 +218365,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -217383,8 +218432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1357 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1363 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217403,7 +218452,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -217419,15 +218468,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -217435,28 +218484,32 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 - LSPA: 4 - LSPB: 32 - LVCA: 64 - LVCB: 8 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3072 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 2048 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -217469,7 +218522,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -217484,13 +218537,13 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -217503,7 +218556,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -217551,8 +218604,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1358 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1364 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217571,11 +218624,11 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -217622,15 +218675,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217723,8 +218776,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1359 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1365 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217794,15 +218847,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217895,8 +218948,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1360 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1366 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -217931,7 +218984,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -217940,7 +218993,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -217958,23 +219011,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 7232 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -217985,7 +219038,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -218001,11 +219054,11 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218067,8 +219120,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1361 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1367 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218091,7 +219144,7 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218110,8 +219163,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218119,11 +219172,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218131,22 +219184,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218172,12 +219225,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218192,7 +219243,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -218239,8 +219290,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1362 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1368 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218259,11 +219310,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218282,8 +219335,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218291,11 +219344,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218303,22 +219356,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218344,12 +219397,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218411,8 +219462,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1363 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1369 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218431,11 +219482,13 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218447,16 +219500,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -218474,23 +219527,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 16 + LSCB: 8 LSPA: 8 LSPB: 64 LVCA: 32 LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 7232 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218501,7 +219554,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 128 MacroTile1: 64 MacroTileA: 128 @@ -218517,11 +219570,9 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218583,8 +219634,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1364 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1370 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218605,9 +219656,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -218627,7 +219680,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218635,11 +219688,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218647,9 +219700,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -218688,10 +219741,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218753,8 +219806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1365 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1371 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218773,9 +219826,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -218799,7 +219852,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -218807,11 +219860,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -218819,22 +219872,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -218860,10 +219913,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -218925,8 +219978,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1366 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1372 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -218945,9 +219998,9 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -218970,37 +220023,37 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSPA: 2 + LSPB: 32 + LVCA: 128 + LVCB: 8 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -219019,9 +220072,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219031,12 +220084,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219097,8 +220152,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1367 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1373 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219107,23 +220162,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219142,8 +220195,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219151,11 +220204,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -219163,16 +220216,16 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -219191,9 +220244,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219203,12 +220256,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219269,8 +220324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1368 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1374 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219279,23 +220334,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219315,7 +220368,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219323,11 +220376,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -219335,22 +220388,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219363,9 +220416,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219375,12 +220428,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219394,7 +220447,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -219441,8 +220494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1369 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1375 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219451,17 +220504,17 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] + ThreadTile: [8, 8] ThreadTile0: 8 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 8 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B @@ -219486,31 +220539,31 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 2 - LSPB: 32 - LVCA: 128 - LVCB: 8 + LSPA: 8 + LSPB: 128 + LVCA: 32 + LVCB: 2 LVPA: 2 LVPB: 32 LdcEqualsLdd: false @@ -219549,12 +220602,10 @@ NonTemporalC: 0 NumElementsPerThread: 64 NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -219615,8 +220666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1370 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW1_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 + SolutionIndex: 1376 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -219631,7 +220682,7 @@ ThreadTileA: 8 ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -219640,6 +220691,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -219659,7 +220712,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219667,34 +220720,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219706,10 +220759,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219719,15 +220772,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 - NumLoadsA: 2 - NumLoadsB: 2 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -219740,7 +220793,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -219787,28 +220840,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1371 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1377 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -219831,7 +220884,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -219839,34 +220892,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 - LSPA: 4 - LSPB: 64 - LVCA: 64 + LSPA: 8 + LSPB: 32 + LVCA: 16 LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 1824 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -219878,10 +220931,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -219891,13 +220944,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 32 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 256 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -219957,28 +221010,28 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1372 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1378 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] + VectorWidth: 4 + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 @@ -220001,10 +221054,10 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -220021,24 +221074,20 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 128 - LVCA: 32 - LVCB: 2 + LSPB: 32 + LVCA: 16 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 800 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220050,10 +221099,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 128 - MacroTileA: 128 - MacroTileB: 128 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220063,13 +221112,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 64 - NumGlobalWriteVectorsPerThread: 16 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -220081,7 +221130,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -220129,29 +221178,29 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1373 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 + SolutionIndex: 1379 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 8] - ThreadTile0: 8 - ThreadTile1: 8 - ThreadTileA: 8 - ThreadTileB: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -220175,19 +221224,19 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -220195,22 +221244,22 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 - LSPB: 32 - LVCA: 16 - LVCB: 4 + LSPA: 4 + LSPB: 16 + LVCA: 32 + LVCB: 8 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1824 + LdsNumElements: 1680 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 192 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220223,9 +221272,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 16 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220235,13 +221284,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 128 OptNoLoadLoop: 1 @@ -220256,7 +221305,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -220303,8 +221352,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1374 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1380 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220313,19 +221362,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -220346,9 +221395,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -220368,21 +221417,21 @@ LSCA: 64 LSCB: 8 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 16 - LVCB: 4 + LVCB: 2 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1824 + LdsNumElements: 2048 LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 1024 LdsOffsetB: 512 LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220395,9 +221444,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220407,8 +221456,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -220473,8 +221524,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1375 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR0_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1381 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220483,11 +221534,11 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -220495,11 +221546,9 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -220517,9 +221566,9 @@ DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -220527,11 +221576,11 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -220539,18 +221588,22 @@ KernelLanguage: Assembly LSCA: 64 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 32 - LVCA: 16 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 800 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 + LdsOffsetA_Blk: 1024 LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -220563,9 +221616,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 64 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220575,12 +221628,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220593,8 +221646,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchGlobalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -220641,8 +221694,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1376 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_GSU1_LPB4_PGR0_PLR1_TT4_4_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1382 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220651,19 +221704,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 @@ -220680,14 +221733,14 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -220695,32 +221748,28 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 1 LSPB: 16 - LVCA: 32 + LVCA: 128 LVCB: 8 - LVPA: 2 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1680 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 192 + LdsNumElements: 1296 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -220729,15 +221778,15 @@ LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220747,14 +221796,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220767,7 +221816,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -220815,8 +221864,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1377 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x8_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG16_8_1_WGM1 + SolutionIndex: 1383 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220825,19 +221874,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -220852,64 +221901,60 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 16 - LVCB: 2 - LVPA: 2 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -220921,12 +221966,12 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -220939,8 +221984,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -220987,8 +222032,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1378 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM8 + SolutionIndex: 1384 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -220997,19 +222042,19 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -221024,64 +222069,60 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: false - DirectToLdsA: false + DirectToLds: true + DirectToLdsA: true DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 - LSPB: 32 - LVCA: 32 - LVCB: 4 - LVPA: 2 + LSPA: 1 + LSPB: 16 + LVCA: 128 + LVCB: 8 + LVPA: 1 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 512 + LdsNumElements: 1312 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetB: 1024 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: false + LocalWriteUseSgprA: true LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 64 - MacroTileA: 64 - MacroTileB: 64 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221092,10 +222133,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 @@ -221109,8 +222152,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -221157,8 +222200,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1379 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR0_TT4_8_USFGRO0_VW2_WG16_8_1_WGM8 + SolutionIndex: 1385 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -221167,23 +222210,21 @@ SubGroupA: 16 SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 8, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221195,61 +222236,61 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1296 + LdsNumElements: 2560 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetB: 1536 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221259,15 +222300,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221280,7 +222321,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: false - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -221327,31 +222368,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1380 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB2_PGR0_PLR1_TT8_4_USFGRO1_VW2_WG16_8_1_WGM8 + SolutionIndex: 1386 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221363,61 +222404,65 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DepthU: 16 + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 + LSCA: 32 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 LVCB: 8 - LVPA: 1 + LVPA: 8 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 1312 + LdsNumElements: 6656 + LdsNumElementsAlignedA: 1536 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1536 + LdsOffsetB_Blk: 5632 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221427,15 +222472,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 + NumLoadsA: 3 NumLoadsB: 2 - NumLoadsCoalescedA: 1 + NumLoadsCoalescedA: 3 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221447,7 +222492,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -221495,31 +222540,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1381 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM1 + SolutionIndex: 1387 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] + VectorWidth: 2 + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221532,60 +222577,64 @@ CheckDimOverflow: 0 CheckTensorDimAsserts: false DepthU: 8 - DirectToLds: true - DirectToLdsA: true + DirectToLds: false + DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 96 LSCB: 8 - LSPA: 1 - LSPB: 16 - LVCA: 128 - LVCB: 8 - LVPA: 1 - LVPB: 16 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 1312 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1024 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true - LocalWriteUseSgprA: true + LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 32 - MacroTileA: 128 - MacroTileB: 32 + MacroTile0: 96 + MacroTile1: 64 + MacroTileA: 96 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221595,15 +222644,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 2 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -221615,7 +222662,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -221663,31 +222710,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1382 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_EPS0_FL0_GRVW1_GSU1_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_8_1_WGM8 + SolutionIndex: 1388 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 8 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [6, 4] + ThreadTile0: 6 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 6 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221699,14 +222748,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 + ExpandPointerSwap: true + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -221725,18 +222774,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 96 + LSCB: 8 + LSPA: 5 + LSPB: 64 + LVCA: 48 + LVCB: 4 + LVPA: 3 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2560 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 768 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetB: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 768 + LdsOffsetB_Blk: 2816 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -221749,7 +222802,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 96 MacroTile1: 64 MacroTileA: 96 @@ -221765,12 +222818,10 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -221783,8 +222834,8 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -221831,8 +222882,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1383 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS0_FL0_GRVW2_GSU1_LPB0_PGR0_PLR0_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1389 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -221853,9 +222904,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 3 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -221867,14 +222920,14 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -221893,22 +222946,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 8 - LVPB: 16 + LSCA: 64 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 4 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 6656 - LdsNumElementsAlignedA: 1536 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3328 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1536 - LdsOffsetB_Blk: 5632 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -221921,11 +222974,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -221937,9 +222990,7 @@ NonTemporalC: 0 NumElementsPerThread: 24 NumGlobalWriteVectorsPerThread: 12 - NumLoadsA: 3 - NumLoadsB: 2 - NumLoadsCoalescedA: 3 + NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 2 @@ -222003,8 +223054,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1384 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1390 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -222013,11 +223064,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -222025,9 +223076,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 32 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222048,14 +223101,14 @@ ExpandPointerSwap: true FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -222065,22 +223118,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 + LSCA: 64 LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSPA: 8 + LSPB: 96 + LVCA: 32 + LVCB: 2 + LVPA: 4 + LVPB: 24 LdcEqualsLdd: false LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 768 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -222094,10 +223147,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 96 + MacroTileA: 64 + MacroTileB: 96 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222111,7 +223164,7 @@ NumGlobalWriteVectorsPerThread: 12 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -222173,8 +223226,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1385 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM1 + SolutionIndex: 1391 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -222183,11 +223236,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 - ThreadTile1: 4 - ThreadTileA: 6 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -222195,10 +223248,10 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 + fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 @@ -222211,15 +223264,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222227,38 +223280,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 96 - LSCB: 8 - LSPA: 5 - LSPB: 64 - LVCA: 48 - LVCB: 4 - LVPA: 3 - LVPB: 32 + LSCA: 64 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 768 - LdsNumElementsAlignedB: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 768 - LdsOffsetB_Blk: 2816 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222266,10 +223319,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 96 - MacroTile1: 64 - MacroTileA: 96 - MacroTileB: 64 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222279,11 +223332,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -222345,33 +223400,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1386 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT6_4_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1392 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [6, 4] - ThreadTile0: 6 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 6 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 3 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222383,7 +223436,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -222391,7 +223444,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222399,38 +223452,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 16 + LSPB: 32 + LVCA: 16 + LVCB: 8 LVPA: 4 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222439,9 +223492,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 96 + MacroTile1: 32 MacroTileA: 64 - MacroTileB: 96 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222451,12 +223504,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222517,33 +223570,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1387 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW2_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1393 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 6] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 6 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 6 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 32 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222555,54 +223608,54 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 8 - LSPA: 8 - LSPB: 96 - LVCA: 32 - LVCB: 2 - LVPA: 4 - LVPB: 24 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3328 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 768 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222610,10 +223663,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 96 - MacroTileA: 64 - MacroTileB: 96 + MacroTile0: 8 + MacroTile1: 32 + MacroTileA: 8 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -222623,12 +223676,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 12 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222689,33 +223744,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1388 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x96x8_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT4_6_USFGRO0_VW2_WG16_16_1_WGM8 + SolutionIndex: 1394 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 4 SubGroup1: 16 - SubGroupA: 16 + SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222727,7 +223780,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -222735,7 +223788,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 1 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -222743,38 +223796,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 32 - LVCA: 16 - LVCB: 8 - LVPA: 4 - LVPB: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 16 + LVCA: 8 + LVCB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222782,9 +223835,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -222795,14 +223848,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -222863,31 +223916,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1389 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1395 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] - WorkGroupMapping: 1 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -222899,16 +223952,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -222919,34 +223972,34 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 16 - LSPA: 16 + LSCA: 8 + LSCB: 32 + LSPA: 32 LSPB: 32 - LVCA: 16 + LVCA: 8 LVCB: 8 - LVPA: 4 - LVPB: 16 + LVPA: 32 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 256 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 2 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -222954,9 +224007,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 8 MacroTile1: 32 - MacroTileA: 64 + MacroTileA: 8 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -222967,8 +224020,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -223033,33 +224088,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1390 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_8_2_WGM1 + SolutionIndex: 1396 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 8, 2] + VectorWidth: 2 + WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223080,14 +224133,14 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -223100,9 +224153,9 @@ LSCA: 8 LSCB: 32 LSPA: 32 - LSPB: 8 + LSPB: 32 LVCA: 8 - LVCB: 32 + LVCB: 8 LVPA: 32 LVPB: 8 LdcEqualsLdd: false @@ -223142,11 +224195,11 @@ NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 1 - NumLoadsB: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223207,8 +224260,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1391 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1397 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223229,7 +224282,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -223252,39 +224305,39 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 32 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223298,9 +224351,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223311,14 +224364,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223379,8 +224432,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1392 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 1398 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223389,10 +224442,10 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 @@ -223401,7 +224454,7 @@ VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - AggressivePerfMode: 1 @@ -223422,41 +224475,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 - LSPB: 32 + LSPB: 16 LVCA: 8 - LVCB: 8 - LVPA: 32 + LVCB: 16 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223470,9 +224523,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223483,14 +224536,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223551,8 +224602,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1393 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1399 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223561,21 +224612,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223594,8 +224647,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true @@ -223607,28 +224660,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 8 + LSCA: 16 LSCB: 32 LSPA: 32 LSPB: 32 LVCA: 8 LVCB: 8 - LVPA: 32 + LVPA: 16 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 - LdsNumElementsAlignedA: 256 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223642,9 +224695,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 8 + MacroTile0: 16 MacroTile1: 32 - MacroTileA: 8 + MacroTileA: 16 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -223655,10 +224708,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -223723,8 +224774,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1394 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM8 + SolutionIndex: 1400 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223733,21 +224784,23 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 + ThreadTile: [4, 2] + ThreadTile0: 4 ThreadTile1: 2 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223759,7 +224812,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -223768,15 +224821,15 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 + GlobalReadVectorWidth: 2 + GlobalSplitU: 8 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 2 @@ -223786,21 +224839,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 16 - LSCB: 32 + LSCB: 16 LSPA: 16 - LSPB: 8 + LSPB: 32 LVCA: 16 - LVCB: 32 + LVCB: 8 LVPA: 16 - LVPB: 8 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 256 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 256 + LdsOffsetB_Blk: 2304 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223813,11 +224866,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 4 MacroTile0: 16 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 16 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -223827,14 +224880,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 1 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 2 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -223895,8 +224948,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1395 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1401 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -223905,11 +224958,11 @@ SubGroupA: 4 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -223919,7 +224972,7 @@ WorkGroup: [4, 16, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -223938,9 +224991,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -223951,28 +225004,28 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 16 - LVCA: 8 - LVCB: 16 - LVPA: 16 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -223986,10 +225039,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -223999,12 +225052,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -224065,20 +225120,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1396 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 + SolutionIndex: 1402 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -224086,12 +225141,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224110,41 +225163,41 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 + LSCA: 32 LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 16 + LSPA: 16 + LSPB: 8 + LVCA: 16 + LVCB: 32 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -224158,10 +225211,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 16 - MacroTile1: 32 - MacroTileA: 16 - MacroTileB: 32 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224171,11 +225224,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 1 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -224237,20 +225292,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1397 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM1 + SolutionIndex: 1403 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 ThreadTile1: 2 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false @@ -224258,12 +225313,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] - WorkGroupMapping: 1 + WorkGroup: [16, 4, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224275,15 +225328,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -224292,31 +225345,31 @@ GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true GlobalReadVectorWidth: 2 - GlobalSplitU: 8 + GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 - LSPB: 32 + LSPB: 8 LVCA: 16 - LVCB: 8 - LVPA: 16 - LVPB: 16 + LVCB: 16 + LVPA: 8 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 256 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3392 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 256 - LdsOffsetB_Blk: 2304 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 2 LocalDotLayout: 1 @@ -224329,11 +225382,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 + LoopUnroll: 8 + MacroTile0: 32 + MacroTile1: 8 + MacroTileA: 32 + MacroTileB: 8 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224343,14 +225396,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 1 - NumLoadsB: 2 + NumElementsPerThread: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -224411,31 +225462,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1398 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW2_GSU8_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG4_16_4_WGM1 + SolutionIndex: 1404 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 4 - SubGroup1: 16 - SubGroupA: 4 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 4 + SubGroupA: 16 + SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 - ThreadTile1: 4 - ThreadTileA: 4 - ThreadTileB: 4 + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [4, 16, 4] + WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224456,7 +225509,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -224467,7 +225520,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -224476,15 +225529,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -224503,9 +225556,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224515,7 +225568,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsA: 2 NumLoadsB: 1 @@ -224583,8 +225636,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1399 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1405 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224593,11 +225646,11 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -224626,9 +225679,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -224639,7 +225692,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 @@ -224648,15 +225701,15 @@ LSCA: 32 LSCB: 32 LSPA: 16 - LSPB: 8 + LSPB: 16 LVCA: 16 - LVCB: 32 + LVCB: 16 LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 3648 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 @@ -224675,9 +225728,9 @@ LoopTail: true LoopUnroll: 8 MacroTile0: 32 - MacroTile1: 8 + MacroTile1: 16 MacroTileA: 32 - MacroTileB: 8 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224687,10 +225740,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -224755,8 +225806,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1400 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM8 + SolutionIndex: 1406 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224765,11 +225816,11 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] + ThreadTile: [2, 4] ThreadTile0: 2 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 2 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true @@ -224777,9 +225828,11 @@ VectorStore: true VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224791,42 +225844,42 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 8 - LVCA: 16 + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 64 LVCB: 16 - LVPA: 8 - LVPB: 4 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3392 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 320 LdsOffsetA: 0 @@ -224845,11 +225898,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 8 - MacroTileA: 32 - MacroTileB: 8 + LoopUnroll: 4 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -224859,11 +225912,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 1 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 4 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -224925,8 +225980,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1401 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x8x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_2_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1407 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -224935,23 +225990,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 2] - ThreadTile0: 2 - ThreadTile1: 2 - ThreadTileA: 2 - ThreadTileB: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -224963,7 +226016,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -224971,42 +226024,42 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 + LSCA: 64 + LSCB: 16 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 4096 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225017,10 +226070,10 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 32 + LoopUnroll: 4 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225031,13 +226084,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 + NumLoadsA: 1 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225099,8 +226152,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1402 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1408 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225109,21 +226162,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225142,8 +226195,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225151,34 +226204,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 + LSCA: 64 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 8 + LVPA: 4 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3648 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225190,9 +226243,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 + MacroTile0: 64 MacroTile1: 16 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225203,8 +226256,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 + NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -225269,8 +226324,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1403 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU1_LPB2_PGR1_PLR1_TT2_4_USFGRO0_VW2_WG16_4_4_WGM1 + SolutionIndex: 1409 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225279,23 +226334,21 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225307,50 +226360,50 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 4 + LSCB: 32 + LSPA: 16 LSPB: 16 - LVCA: 64 + LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225361,7 +226414,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -225376,12 +226429,10 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 2 - NumLoadsA: 4 - NumLoadsB: 1 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225443,8 +226494,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1404 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW1_GSU1_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_4_4_WGM8 + SolutionIndex: 1410 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225459,15 +226510,17 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225479,16 +226532,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -225506,21 +226559,21 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 LVPA: 4 - LVPB: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 6784 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 640 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -225533,7 +226586,7 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 4 + LoopUnroll: 8 MacroTile0: 64 MacroTile1: 16 MacroTileA: 64 @@ -225549,11 +226602,9 @@ NonTemporalC: 0 NumElementsPerThread: 4 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -225615,8 +226666,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1405 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1411 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225639,7 +226690,9 @@ WorkGroup: [16, 4, 4] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225658,8 +226711,8 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225667,34 +226720,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225706,9 +226759,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225719,10 +226772,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 - NumLoadsA: 2 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -225783,12 +226834,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1406 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1412 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225797,21 +226850,23 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 4, 4] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -225831,7 +226886,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -225839,34 +226894,34 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 32 LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 16 LVCB: 16 - LVPA: 4 + LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6784 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -225878,9 +226933,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 32 MacroTile1: 16 - MacroTileA: 64 + MacroTileA: 32 MacroTileB: 16 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -225891,7 +226946,7 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 + NumElementsPerThread: 2 NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 @@ -225953,12 +227008,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1407 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM1 + SolutionIndex: 1413 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -225967,19 +227024,19 @@ SubGroupA: 16 SubGroupB: 4 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [2, 4] + ThreadTile0: 2 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 2 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 1 fractionalPerpOverhangA: 0 @@ -226125,12 +227182,14 @@ TransposeB: false UseBeta: true UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1408 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU1_LPB4_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG16_4_4_WGM8 + SolutionIndex: 1414 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -226167,15 +227226,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -226183,38 +227242,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -226222,10 +227281,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226235,11 +227294,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -226254,7 +227315,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -226303,33 +227364,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1409 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM1 + SolutionIndex: 1415 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226341,15 +227400,15 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -226357,38 +227416,38 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSCA: 128 + LSCB: 8 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3648 + LdsNumElements: 3584 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 2 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -226396,10 +227455,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 32 - MacroTile1: 16 - MacroTileA: 32 - MacroTileB: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226409,11 +227468,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 2 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -226477,33 +227538,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1410 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR1_TT2_4_USFGRO0_VW2_WGM8 + SolutionIndex: 1416 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226515,16 +227574,16 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -226541,39 +227600,39 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 8 + LSCA: 128 + LSCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 + LVPA: 2 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6784 + LdsNumElements: 7168 LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 640 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 4 + LocalSplitU: 1 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -226583,8 +227642,10 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 4 - NumGlobalWriteVectorsPerThread: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -226651,20 +227712,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1411 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1417 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 4 + SubGroup1: 16 SubGroupA: 16 - SubGroupB: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -226672,12 +227733,10 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 4, 4] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226696,7 +227755,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -226759,8 +227818,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -226827,8 +227884,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1412 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1418 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -226852,6 +227909,8 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -226870,7 +227929,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -226933,8 +227992,6 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -227001,8 +228058,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1413 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1419 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227023,9 +228080,11 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -227044,7 +228103,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -227060,25 +228119,26 @@ GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 16 - LSPA: 8 + LSPA: 16 LSPB: 64 - LVCA: 32 + LVCA: 16 LVCB: 4 - LVPA: 2 + LVPA: 4 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 7168 - LdsNumElementsAlignedA: 2048 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 2048 - LdsOffsetB_Blk: 6144 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -227092,9 +228152,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -227105,21 +228165,21 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 1 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227175,8 +228235,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1414 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1420 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227185,10 +228245,10 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 8 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -227200,6 +228260,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -227219,7 +228281,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthA: 2 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -227227,21 +228289,22 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 8 + LSPA: 4 LSPB: 64 - LVCA: 32 + LVCA: 64 LVCB: 4 LVPA: 2 LVPB: 32 @@ -227280,18 +228343,20 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227347,8 +228412,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1415 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1421 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227367,7 +228432,7 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -227408,6 +228473,7 @@ GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: false GuaranteeNoPartialB: true + ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly @@ -227464,8 +228530,10 @@ PackBatchDims: 0 PackFreeDims: 1 PackGranularity: 2 - PackedC0Indices: [I] - PackedC1Indices: [J] + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] PerformanceSyncLocation: -1 PerformanceWaitCount: -1 PerformanceWaitLocation: -1 @@ -227521,8 +228589,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1416 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM8 + SolutionIndex: 1422 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227543,13 +228611,13 @@ VectorStore: true VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227580,7 +228648,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227623,7 +228691,6 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 @@ -227698,8 +228765,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1417 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT4_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1423 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227714,7 +228781,7 @@ ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -227726,7 +228793,7 @@ fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227744,7 +228811,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 1 - GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true @@ -227752,12 +228819,12 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227765,9 +228832,9 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 64 - LVCA: 64 + LVCA: 32 LVCB: 4 LVPA: 2 LVPB: 32 @@ -227800,16 +228867,15 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -227826,7 +228892,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -227875,8 +228941,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1418 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_NLCA1_PGR1_PLR0_TT8_4_USFGRO0_VW2_WGM8 + SolutionIndex: 1424 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -227891,19 +228957,19 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [16, 16, 1] - WorkGroupMapping: 8 + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 + AssertFree0ElementMultiple: 4 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -227934,7 +229000,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -227949,15 +229015,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -227977,7 +229043,6 @@ MacroTileShapeMin: 1 MaxOccupancy: 40 MaxVgprNumber: 256 - MinGlobalWriteVectorWidth: 1 MinVgprNumber: 0 NonTemporalA: 0 NonTemporalB: 0 @@ -228052,8 +229117,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1419 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_NLCA1_PGR1_PLR1_TT8_4_USFGRO0_VW4_WGM1 + SolutionIndex: 1425 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228068,7 +229133,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -228080,7 +229145,7 @@ fractionalPerpOverhangA: 0 fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228090,23 +229155,23 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 8 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -228118,23 +229183,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 64 - LSCB: 16 - LSPA: 16 - LSPB: 64 - LVCA: 16 - LVCB: 4 + LSCB: 8 + LSPA: 4 + LSPB: 32 + LVCA: 64 + LVCB: 8 LVPA: 4 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 4096 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 1024 + LdsNumElements: 3616 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228145,11 +229210,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 8 MacroTile0: 64 - MacroTile1: 64 + MacroTile1: 128 MacroTileA: 64 - MacroTileB: 64 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228158,12 +229223,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228228,8 +229295,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1420 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x16_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1426 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW1_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228238,11 +229305,11 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 8] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 8 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -228252,11 +229319,9 @@ WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 3 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228273,7 +229338,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 2 GlobalRead2A: true @@ -228287,7 +229352,7 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -228302,15 +229367,15 @@ LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3584 + LdsNumElements: 3616 LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 1024 LdsOffsetB_Blk: 3072 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228336,6 +229401,8 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -228355,7 +229422,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -228404,8 +229471,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1421 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1427 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -228420,7 +229487,7 @@ ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true @@ -228429,10 +229496,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 4 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 @@ -228449,9 +229514,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 + FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -228463,30 +229528,30 @@ GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 + LSCA: 64 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 32 - LVCB: 4 + LVCA: 16 + LVCB: 2 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2048 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 1024 + LdsOffsetB: 512 + LdsOffsetB_Blk: 1536 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228498,9 +229563,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 + MacroTile0: 64 MacroTile1: 64 - MacroTileA: 128 + MacroTileA: 64 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -228512,11 +229577,13 @@ NonTemporalC: 0 NumElementsPerThread: 32 NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 256 + NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -228531,7 +229598,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: true + PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true Batched: true @@ -228580,33 +229647,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1422 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1428 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 SubGroup0: 16 - SubGroup1: 16 + SubGroup1: 8 SubGroupA: 16 - SubGroupB: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] + WorkGroup: [16, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 3 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -228626,7 +229691,7 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true @@ -228634,33 +229699,33 @@ GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 - LSPA: 4 + LSPA: 8 LSPB: 32 - LVCA: 64 + LVCA: 32 LVCB: 8 - LVPA: 4 + LVPA: 2 LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 512 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 3360 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 320 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 512 - LdsOffsetB_Blk: 2560 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 4 LocalDotLayout: 1 @@ -228674,10 +229739,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228686,14 +229751,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228758,35 +229823,35 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1423 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x128x8_SE_AMAS3_DTL0_EPS1_GRVW1_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 + SolutionIndex: 1429 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 16, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 + AssertMinApproxSize: 1 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -228794,51 +229859,47 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true + ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 - GuaranteeNoPartialA: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 128 + LVCB: 16 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3616 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 576 + LdsNumElements: 2832 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 2048 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 1 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -228849,11 +229910,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 64 + MacroTile1: 48 MacroTileA: 128 - MacroTileB: 64 + MacroTileB: 48 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -228862,14 +229923,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 24 + NumGlobalWriteVectorsPerThread: 24 + NumLoadsA: 8 + NumLoadsB: 3 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 - NumLoadsPerpendicularB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 3 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -228884,7 +229945,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: false ProblemType: AssignedDerivedParameters: true @@ -228934,31 +229995,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1424 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR0_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 + SolutionIndex: 1430 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x48x16_SE_AMAS1_DTL0_EPS0_GRVW1_LPB1_NLCA1_PBD0_PGR0_PLR0_TT4_6_USFGRO1_VW1_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 16 - SubGroupA: 16 - SubGroupB: 16 + SubGroup0: 32 + SubGroup1: 8 + SubGroupA: 32 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [8, 4] - ThreadTile0: 8 - ThreadTile1: 4 - ThreadTileA: 8 - ThreadTileB: 4 + ThreadTile: [4, 6] + ThreadTile0: 4 + ThreadTile1: 6 + ThreadTileA: 4 + ThreadTileB: 6 UnrollMemFence: false - UseSgprForGRO: false + UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [16, 16, 1] + VectorWidth: 1 + WorkGroup: [32, 8, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -228979,7 +230040,7 @@ ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -228997,22 +230058,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 64 + LSCA: 128 LSCB: 8 LSPA: 8 LSPB: 64 - LVCA: 16 - LVCB: 2 + LVCA: 32 + LVCB: 4 LVPA: 2 - LVPB: 16 + LVPB: 32 LdcEqualsLdd: false - LdsNumElements: 2048 - LdsNumElementsAlignedA: 512 + LdsNumElements: 3584 + LdsNumElementsAlignedA: 1024 LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 1024 - LdsOffsetB: 512 - LdsOffsetB_Blk: 1536 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -229026,9 +230087,9 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 64 + MacroTile0: 128 MacroTile1: 64 - MacroTileA: 64 + MacroTileA: 128 MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -229046,7 +230107,7 @@ NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 NumLoadsPerpendicularB: 1 - NumThreads: 128 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -229061,7 +230122,7 @@ PersistentKernel: 0 PrefetchAcrossPersistent: 0 PrefetchGlobalRead: true - PrefetchLocalRead: false + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true @@ -229110,14 +230171,14 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1425 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT64x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR0_TT4_8_USFGRO0_VW4_WG16_8_1_WGM1 + SolutionIndex: 1431 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 + SubGroup0: 32 SubGroup1: 8 - SubGroupA: 16 + SubGroupA: 32 SubGroupB: 8 SuppressNoLoadLoop: false ThreadTile: [4, 8] @@ -229131,8 +230192,8 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 1 + WorkGroup: [32, 8, 1] + WorkGroupMapping: 64 WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 @@ -229146,16 +230207,17 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -229174,23 +230236,23 @@ InterleaveAlpha: 0 KernelLanguage: Assembly LSCA: 128 - LSCB: 8 + LSCB: 16 LSPA: 8 - LSPB: 32 + LSPB: 64 LVCA: 32 - LVCB: 8 + LVCB: 4 LVPA: 2 - LVPB: 32 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3360 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 320 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 4 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -229201,11 +230263,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 8 + LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229214,13 +230276,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 1 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 NumLoadsB: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularA: 2 NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 @@ -229244,6 +230306,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229267,6 +230330,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229286,20 +230350,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1426 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x32x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB4_NLCA1_PBD0_PGR1_PLR1_TT4_4_USFGRO0_VW4_WG32_8_1_WGM64 + SolutionIndex: 1432 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] - ThreadTile0: 4 + ThreadTile: [8, 4] + ThreadTile0: 8 ThreadTile1: 4 - ThreadTileA: 4 + ThreadTileA: 8 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: false @@ -229307,14 +230371,14 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 1 + AssertMinApproxSize: 3 AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true @@ -229326,24 +230390,25 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 @@ -229351,18 +230416,22 @@ KernelLanguage: Assembly LSCA: 128 LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 128 - LVCB: 16 + LSPA: 8 + LSPB: 64 + LVCA: 32 + LVCB: 4 LVPA: 2 LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2832 + LdsNumElements: 7168 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 1024 LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 LdsPadA: 0 - LdsPadB: 1 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -229375,9 +230444,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 128 - MacroTile1: 48 + MacroTile1: 64 MacroTileA: 128 - MacroTileB: 48 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229386,14 +230455,12 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 24 - NumGlobalWriteVectorsPerThread: 24 - NumLoadsA: 8 - NumLoadsB: 3 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 3 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 1 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -229408,14 +230475,15 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: false + PrefetchGlobalRead: true + PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true Batched: true ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229439,6 +230507,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229458,31 +230527,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1427 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x48x16_SE_AMAS1_DTL0_EPS0_GRVW1_LPB1_NLCA1_PBD0_PGR0_PLR0_TT4_6_USFGRO1_VW1_WG32_8_1_WGM1 + SolutionIndex: 1433 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPB0_PGR1_PLR1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 6] - ThreadTile0: 4 - ThreadTile1: 6 - ThreadTileA: 4 - ThreadTileB: 6 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 1 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -229494,16 +230565,17 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 8 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 2 + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -229514,35 +230586,35 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: false GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 128 - LSCB: 8 - LSPA: 8 - LSPB: 64 - LVCA: 32 - LVCB: 4 - LVPA: 2 - LVPB: 32 + LSCA: 16 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3584 - LdsNumElementsAlignedA: 1024 - LdsNumElementsAlignedB: 512 + LdsNumElements: 3648 + LdsNumElementsAlignedA: 512 + LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 - LdsOffsetB: 1024 - LdsOffsetB_Blk: 3072 + LdsOffsetB: 512 + LdsOffsetB_Blk: 2560 LdsPadA: 0 - LdsPadB: 0 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 4 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false @@ -229550,10 +230622,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 8 - MacroTile0: 128 - MacroTile1: 64 - MacroTileA: 128 - MacroTileB: 64 + MacroTile0: 16 + MacroTile1: 32 + MacroTileA: 16 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -229562,10 +230634,8 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 1 - NumLoadsB: 1 + NumElementsPerThread: 2 + NumGlobalWriteVectorsPerThread: 1 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 1 @@ -229592,6 +230662,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 0 DestDataType: 0 HighPrecisionAccumulate: false @@ -229615,6 +230686,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: true TLUB: false @@ -229634,31 +230706,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1428 - SolutionNameMin: Cijk_Ailk_Bljk_SB_MT128x64x8_SE_AMAS3_DTL0_EPS1_GRVW4_LPB0_NLCA1_PBD0_PGR1_PLR1_TT4_8_USFGRO0_VW4_WG32_8_1_WGM64 + SolutionIndex: 1434 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT16x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPB2_PGR1_PLR1_TT4_2_USFGRO0_VW2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 32 - SubGroup1: 8 - SubGroupA: 32 - SubGroupB: 8 + SubGroup0: 4 + SubGroup1: 16 + SubGroupA: 4 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 8 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 8 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: false Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 - WorkGroup: [32, 8, 1] - WorkGroupMapping: 64 + VectorWidth: 2 + WorkGroup: [4, 16, 4] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 3 + _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [704, 1024, 1, 128] - [102, 3019.46] @@ -236130,6436 +237204,6448 @@ - [714, 7844.54] - - [1225, 64, 64, 256] - [721, 8721.52] + - - [65, 6400, 1, 1024] + - [722, 2839.89] + - - [256, 6400, 1, 4096] + - [723, 7361.66] + - - [1024, 64, 1, 4096] + - [724, 3787.18] - - [704, 1024, 1, 128] - - [824, 3019.46] + - [827, 3019.46] - - [1024, 1024, 1, 3328] - - [862, 8162.55] + - [865, 8162.55] - - [4, 704, 1, 1280] - - [765, 319.546] + - [768, 319.546] - - [4, 1856, 1, 3328] - - [795, 550.514] + - [798, 550.514] - - [1856, 448, 1, 3328] - - [847, 6813.05] + - [850, 6813.05] - - [2944, 4288, 1, 1280] - - [856, 8975.76] + - [859, 8975.76] - - [2368, 64, 1, 3328] - - [770, 5482.23] + - [773, 5482.23] - - [1760, 32, 1, 1760] - - [809, 3859.94] + - [812, 3859.94] - - [2368, 5888, 1, 256] - - [853, 8656.73] + - [856, 8656.73] - - [5888, 1856, 1, 256] - - [843, 7881.43] + - [846, 7881.43] - - [64, 3584, 1, 1280] - - [779, 4835.33] + - [782, 4835.33] - - [512, 24000, 1, 1536] - - [850, 8665.9] + - [853, 8665.9] - - [128, 6784, 1, 3328] - - [847, 7062.25] + - [850, 7062.25] - - [5888, 1408, 1, 256] - - [860, 8130.22] + - [863, 8130.22] - - [5888, 1856, 1, 3328] - - [850, 8840.75] + - [853, 8840.75] - - [512, 4, 1, 512] - - [735, 170.223] + - [738, 170.223] - - [35, 1500, 1, 2560] - - [739, 2896.55] + - [742, 2896.55] - - [1856, 4288, 1, 256] - - [839, 8374.63] + - [842, 8374.63] - - [1024, 5056, 1, 128] - - [836, 3304.25] + - [839, 3304.25] - - [5056, 5056, 1, 3328] - - [850, 8905.43] + - [853, 8905.43] - - [1408, 5888, 1, 1280] - - [850, 9418.1] + - [853, 9418.1] - - [2368, 448, 1, 128] - - [824, 3074.97] + - [827, 3074.97] - - [6144, 6000, 1, 2560] - - [850, 9336.33] + - [853, 9336.33] - - [2368, 6784, 1, 128] - - [823, 4919.26] + - [826, 4919.26] - - [1024, 3584, 1, 3328] - - [841, 8071.07] + - [844, 8071.07] - - [512, 48000, 1, 2048] - - [850, 8763.06] + - [853, 8763.06] - - [1408, 64, 1, 128] - - [746, 805.47] + - [749, 805.47] - - [256, 4288, 1, 3328] - - [872, 6331.86] + - [875, 6331.86] - - [5888, 1408, 1, 1280] - - [840, 9226.17] + - [843, 9226.17] - - [704, 1856, 1, 3328] - - [866, 6309.4] + - [869, 6309.4] - - [1408, 4288, 1, 256] - - [850, 8374.5] + - [853, 8374.5] - - [1024, 2368, 1, 256] - - [847, 7341.02] + - [850, 7341.02] - - [64, 4, 1, 256] - - [790, 13.0032] + - [793, 13.0032] - - [1408, 1856, 1, 1280] - - [857, 8772.95] + - [860, 8772.95] - - [1408, 64, 1, 1280] - - [803, 4049.98] + - [806, 4049.98] - - [448, 1024, 1, 1280] - - [866, 6071.16] + - [869, 6071.16] - - [4096, 32, 1, 4096] - - [800, 5491.72] + - [803, 5491.72] - - [256, 1408, 1, 3328] - - [852, 5351.39] + - [855, 5351.39] - - [5056, 5056, 1, 1280] - - [860, 9408.57] + - [863, 9408.57] - - [448, 5056, 1, 256] - - [865, 6680.44] + - [868, 6680.44] - - [704, 1856, 1, 1280] - - [842, 7503.93] + - [845, 7503.93] - - [128, 5056, 1, 128] - - [757, 2316.48] + - [760, 2316.48] - - [2368, 128, 1, 256] - - [842, 3660.12] + - [845, 3660.12] - - [1856, 1408, 1, 128] - - [829, 3885.87] + - [832, 3885.87] - - [64, 5056, 1, 256] - - [852, 3318.81] + - [855, 3318.81] - - [6784, 256, 1, 3328] - - [850, 7590.54] + - [853, 7590.54] - - [1408, 3584, 1, 256] - - [839, 8276.82] + - [842, 8276.82] - - [4288, 448, 1, 256] - - [852, 7139.69] + - [855, 7139.69] - - [64, 704, 1, 128] - - [753, 375.467] + - [756, 375.467] - - [1024, 1856, 1, 128] - - [822, 2890.56] + - [825, 2890.56] - - [4288, 2944, 1, 1280] - - [856, 8981.35] + - [859, 8981.35] - - [704, 5056, 1, 1280] - - [842, 7684.62] + - [845, 7684.62] - - [2368, 704, 1, 3328] - - [857, 7070.04] + - [860, 7070.04] - - [256, 5888, 1, 256] - - [842, 7319.35] + - [845, 7319.35] - - [1856, 4288, 1, 3328] - - [840, 9238.59] + - [843, 9238.59] - - [256, 2944, 1, 256] - - [842, 6090.21] + - [845, 6090.21] - - [5888, 1024, 1, 256] - - [846, 8269.95] + - [849, 8269.95] - - [448, 64, 1, 1280] - - [799, 2493.22] + - [802, 2493.22] - - [3072, 64, 1, 1024] - - [782, 3149.67] + - [785, 3149.67] - - [3584, 4, 1, 1280] - - [884, 567.762] + - [887, 567.762] - - [2560, 16, 1, 2560] - - [791, 2887.05] + - [794, 2887.05] - - [2944, 64, 1, 256] - - [782, 2565.66] + - [785, 2565.66] - - [128, 4, 1, 1280] - - [885, 78.7692] + - [888, 78.7692] - - [1408, 2944, 1, 256] - - [846, 8337.2] + - [849, 8337.2] - - [256, 1856, 1, 1280] - - [872, 6267.25] + - [875, 6267.25] - - [6784, 5056, 1, 3328] - - [856, 9423.9] + - [859, 9423.9] - - [5056, 5056, 1, 256] - - [843, 8758.23] + - [846, 8758.23] - - [128, 256, 1, 256] - - [798, 1205.26] + - [801, 1205.26] - - [64, 1024, 1, 1280] - - [809, 3566.58] + - [812, 3566.58] - - [2944, 4, 1, 256] - - [762, 319.349] + - [765, 319.349] - - [704, 5056, 1, 128] - - [831, 4073.73] + - [834, 4073.73] - - [4, 2368, 1, 1280] - - [790, 496.892] + - [793, 496.892] - - [2368, 2944, 1, 1280] - - [839, 9085.45] + - [842, 9085.45] - - [448, 448, 1, 3328] - - [817, 5428.66] + - [820, 5428.66] - - [6784, 6784, 1, 1280] - - [856, 8726.93] + - [859, 8726.93] - - [1024, 256, 1, 3328] - - [866, 5499.32] + - [869, 5499.32] - - [1408, 4288, 1, 1280] - - [840, 9094.32] + - [843, 9094.32] - - [3584, 4288, 1, 1280] - - [843, 8703.78] + - [846, 8703.78] - - [512, 6000, 1, 2560] - - [846, 8474.46] + - [849, 8474.46] - - [2368, 704, 1, 1280] - - [852, 7651.49] + - [855, 7651.49] - - [5056, 4288, 1, 3328] - - [860, 8545.25] + - [863, 8545.25] - - [3584, 2368, 1, 3328] - - [848, 8797.78] + - [851, 8797.78] - - [5888, 6784, 1, 1280] - - [846, 8785.08] + - [849, 8785.08] - - [64, 704, 1, 1280] - - [769, 2783.38] + - [772, 2783.38] - - [4288, 256, 1, 256] - - [842, 6162.68] + - [845, 6162.68] - - [2944, 128, 1, 128] - - [744, 1951.23] + - [747, 1951.23] - - [6144, 32, 1, 2560] - - [803, 4588.95] + - [806, 4588.95] - - [6784, 448, 1, 1280] - - [847, 8674.21] + - [850, 8674.21] - - [2944, 5888, 1, 256] - - [860, 8991.66] + - [863, 8991.66] - - [64, 64, 1, 1280] - - [820, 712.348] + - [823, 712.348] - - [4288, 2944, 1, 256] - - [856, 8678.04] + - [859, 8678.04] - - [5888, 704, 1, 1280] - - [846, 8652.61] + - [849, 8652.61] - - [5056, 4, 1, 3328] - - [762, 650.672] + - [765, 650.672] - - [1856, 64, 1, 1280] - - [779, 4471.87] + - [782, 4471.87] - - [1760, 16, 1, 1760] - - [819, 2592.13] + - [822, 2592.13] - - [448, 5888, 1, 128] - - [829, 3822.93] + - [832, 3822.93] - - [5888, 64, 1, 3328] - - [811, 6013.12] + - [814, 6013.12] - - [2944, 256, 1, 3328] - - [852, 7791.35] + - [855, 7791.35] - - [1024, 64, 1, 128] - - [753, 592.416] + - [756, 592.416] - - [5056, 2368, 1, 1280] - - [839, 9260.43] + - [842, 9260.43] - - [448, 3584, 1, 1280] - - [860, 6771.24] + - [863, 6771.24] - - [6784, 5888, 1, 256] - - [854, 7933.29] + - [857, 7933.29] - - [64, 1024, 1, 3328] - - [803, 4782.98] + - [806, 4782.98] - - [704, 128, 1, 1280] - - [809, 3971.88] + - [812, 3971.88] - - [4, 3584, 1, 128] - - [878, 59.4238] + - [881, 59.4238] - - [1408, 448, 1, 1280] - - [852, 5902.07] + - [855, 5902.07] - - [1024, 1408, 1, 256] - - [847, 5272.84] + - [850, 5272.84] - - [2368, 2368, 1, 3328] - - [852, 8488.66] + - [855, 8488.66] - - [1856, 6784, 1, 128] - - [829, 4742.41] + - [832, 4742.41] - - [5056, 704, 1, 3328] - - [855, 7772.38] + - [858, 7772.38] - - [1408, 1856, 1, 256] - - [873, 5229.74] + - [876, 5229.74] - - [1408, 704, 1, 3328] - - [873, 6954.83] + - [876, 6954.83] - - [2368, 5056, 1, 256] - - [846, 8580.58] + - [849, 8580.58] - - [1408, 256, 1, 1280] - - [872, 4790.01] + - [875, 4790.01] - - [3072, 128, 1, 1024] - - [868, 4579.77] + - [871, 4579.77] - - [3584, 2368, 1, 1280] - - [839, 8675.03] + - [842, 8675.03] - - [4288, 64, 1, 3328] - - [818, 5550.01] + - [821, 5550.01] - - [2368, 4, 1, 1280] - - [884, 537.418] + - [887, 537.418] - - [704, 5888, 1, 256] - - [840, 5305.78] + - [843, 5305.78] - - [6784, 2944, 1, 128] - - [836, 4344.11] + - [839, 4344.11] - - [6784, 64, 1, 256] - - [866, 4496.32] + - [869, 4496.32] - - [2944, 256, 1, 256] - - [852, 6553.6] + - [855, 6553.6] - - [2944, 6784, 1, 3328] - - [840, 8895.66] + - [843, 8895.66] - - [128, 1, 1, 1408] - - [820, 25.6] + - [823, 25.6] - - [704, 1408, 1, 3328] - - [854, 7913.11] + - [857, 7913.11] - - [3584, 704, 1, 3328] - - [839, 7526.33] + - [842, 7526.33] - - [2944, 256, 1, 128] - - [823, 2830.66] + - [826, 2830.66] - - [6784, 4, 1, 1280] - - [880, 645.135] + - [883, 645.135] - - [1024, 64, 1, 1280] - - [778, 3013.15] + - [781, 3013.15] - - [8448, 4, 1, 2816] - - [730, 984.668] + - [733, 984.668] - - [448, 4288, 1, 256] - - [852, 7139.69] + - [855, 7139.69] - - [64, 3584, 1, 3328] - - [776, 5683.17] + - [779, 5683.17] - - [704, 2368, 1, 1280] - - [860, 7045.2] + - [863, 7045.2] - - [1856, 2368, 1, 1280] - - [857, 8327.8] + - [860, 8327.8] - - [2368, 128, 1, 3328] - - [793, 6082.55] + - [796, 6082.55] - - [64, 193600, 1, 64] - - [842, 6747.67] + - [845, 6747.67] - - [1760, 128, 1, 1760] - - [770, 5512.97] + - [773, 5512.97] - - [448, 1408, 1, 256] - - [852, 5591.44] + - [855, 5591.44] - - [1856, 4288, 1, 1280] - - [850, 8647.62] + - [853, 8647.62] - - [64, 5056, 1, 3328] - - [810, 6096.49] + - [813, 6096.49] - - [512, 1500, 1, 2816] - - [852, 7879.2] + - [855, 7879.2] - - [1024, 448, 1, 128] - - [824, 1844.23] + - [827, 1844.23] - - [704, 4, 1, 1280] - - [790, 341.333] + - [793, 341.333] - - [704, 256, 1, 128] - - [824, 1001.24] + - [827, 1001.24] - - [256, 193600, 1, 64] - - [860, 8113.2] + - [863, 8113.2] - - [704, 2944, 1, 128] - - [831, 3747.03] + - [834, 3747.03] - - [1408, 1024, 1, 1280] - - [857, 7080.61] + - [860, 7080.61] - - [704, 6784, 1, 256] - - [875, 6630.37] + - [878, 6630.37] - - [6784, 704, 1, 256] - - [842, 8005.76] + - [845, 8005.76] - - [5056, 1408, 1, 128] - - [833, 4303.03] + - [836, 4303.03] - - [2048, 7000, 1, 2048] - - [850, 9269.1] + - [853, 9269.1] - - [256, 3584, 1, 3328] - - [844, 7334.38] + - [847, 7334.38] - - [5056, 704, 1, 256] - - [852, 7954.02] + - [855, 7954.02] - - [128, 1408, 1, 128] - - [747, 1242.92] + - [750, 1242.92] - - [3584, 4288, 1, 3328] - - [876, 7683.71] + - [879, 7683.71] - - [5888, 1856, 1, 1280] - - [840, 8831.24] + - [843, 8831.24] - - [256, 1408, 1, 256] - - [842, 4352.58] + - [845, 4352.58] - - [5056, 64, 1, 1280] - - [809, 5011.95] + - [812, 5011.95] - - [1024, 704, 1, 256] - - [842, 5710.07] + - [845, 5710.07] - - [64, 256, 1, 128] - - [748, 149.797] + - [751, 149.797] - - [2368, 3584, 1, 1280] - - [850, 8609.58] + - [853, 8609.58] - - [1024, 256, 1, 256] - - [866, 3276.8] + - [869, 3276.8] - - [1856, 4, 1, 1280] - - [764, 497.004] + - [767, 497.004] - - [448, 448, 1, 256] - - [852, 3117.73] + - [855, 3117.73] - - [2944, 3584, 1, 3328] - - [840, 8879.35] + - [843, 8879.35] - - [7680, 32, 1, 2560] - - [810, 5310.14] + - [813, 5310.14] - - [128, 4288, 1, 128] - - [750, 2116.1] + - [753, 2116.1] - - [256, 256, 1, 3328] - - [803, 4774.6] + - [806, 4774.6] - - [128, 1024, 1, 3328] - - [804, 5894.7] + - [807, 5894.7] - - [4, 1408, 1, 3328] - - [795, 552.574] + - [798, 552.574] - - [6784, 2944, 1, 256] - - [858, 8271.08] + - [861, 8271.08] - - [64, 1856, 1, 1280] - - [809, 4167.86] + - [812, 4167.86] - - [64, 1024, 1, 128] - - [743, 589.088] + - [746, 589.088] - - [1024, 1500, 1, 2560] - - [847, 8407.78] + - [850, 8407.78] - - [1856, 2368, 1, 256] - - [842, 8092.05] + - [845, 8092.05] - - [3584, 256, 1, 128] - - [825, 2607.47] + - [828, 2607.47] - - [3584, 6784, 1, 3328] - - [859, 8558.73] + - [862, 8558.73] - - [256, 1024, 1, 256] - - [852, 3901.68] + - [855, 3901.68] - - [4, 6784, 1, 3328] - - [790, 662.475] + - [793, 662.475] - - [1024, 5888, 1, 3328] - - [850, 9161.66] + - [853, 9161.66] - - [1024, 128, 1, 1280] - - [807, 3942.02] + - [810, 3942.02] - - [3072, 32, 1, 1024] - - [784, 2840.39] + - [787, 2840.39] - - [6144, 24000, 1, 2560] - - [840, 7605.77] + - [843, 7605.77] - - [448, 1024, 1, 256] - - [842, 5062.09] + - [845, 5062.09] - - [5056, 4288, 1, 1280] - - [850, 9090.89] + - [853, 9090.89] - - [5888, 64, 1, 256] - - [852, 4449.68] + - [855, 4449.68] - - [1856, 256, 1, 1280] - - [866, 5834.36] + - [869, 5834.36] - - [64, 5888, 1, 3328] - - [804, 6152.34] + - [807, 6152.34] - - [2368, 2368, 1, 1280] - - [844, 8594.56] + - [847, 8594.56] - - [2944, 5888, 1, 128] - - [829, 4776.09] + - [832, 4776.09] - - [704, 5888, 1, 1280] - - [844, 8435.81] + - [847, 8435.81] - - [2368, 3584, 1, 128] - - [826, 4590.61] + - [829, 4590.61] - - [1856, 5056, 1, 128] - - [837, 4503.38] + - [840, 4503.38] - - [4608, 1, 1, 1536] - - [735, 226.855] + - [738, 226.855] - - [448, 256, 1, 3328] - - [779, 5415.46] + - [782, 5415.46] - - [2944, 6784, 1, 1280] - - [863, 8385.01] + - [866, 8385.01] - - [448, 1856, 1, 128] - - [833, 2618.86] + - [836, 2618.86] - - [128, 1024, 1, 128] - - [742, 940.427] + - [745, 940.427] - - [7680, 4, 1, 2560] - - [766, 985.004] + - [769, 985.004] - - [1024, 704, 1, 1280] - - [852, 7204.46] + - [855, 7204.46] - - [128, 5888, 1, 256] - - [842, 6313.42] + - [845, 6313.42] - - [1024, 5056, 1, 1280] - - [847, 8979.66] + - [850, 8979.66] - - [4288, 1024, 1, 256] - - [839, 7198.19] + - [842, 7198.19] - - [2944, 2368, 1, 128] - - [824, 4624.47] + - [827, 4624.47] - - [704, 704, 1, 3328] - - [865, 5870.61] + - [868, 5870.61] - - [704, 1408, 1, 1280] - - [854, 7680.22] + - [857, 7680.22] - - [5888, 448, 1, 1280] - - [842, 7718.56] + - [845, 7718.56] - - [3584, 256, 1, 3328] - - [847, 7523.78] + - [850, 7523.78] - - [704, 5888, 1, 3328] - - [852, 8196.89] + - [855, 8196.89] - - [704, 1856, 1, 128] - - [830, 3388.33] + - [833, 3388.33] - - [128, 3584, 1, 3328] - - [804, 6626.4] + - [807, 6626.4] - - [4, 4288, 1, 128] - - [877, 159.548] + - [880, 159.548] - - [128, 704, 1, 1280] - - [767, 4038.63] + - [770, 4038.63] - - [3584, 2944, 1, 256] - - [840, 7685.89] + - [843, 7685.89] - - [1856, 128, 1, 3328] - - [796, 6070.53] + - [799, 6070.53] - - [1856, 2368, 1, 3328] - - [857, 8460.52] + - [860, 8460.52] - - [512, 6000, 1, 2816] - - [860, 9019.45] + - [863, 9019.45] - - [2944, 448, 1, 128] - - [823, 3027.63] + - [826, 3027.63] - - [64, 193600, 1, 256] - - [866, 7080.22] + - [869, 7080.22] - - [128, 2944, 1, 1280] - - [842, 5397.77] + - [845, 5397.77] - - [448, 2944, 1, 1280] - - [852, 6996.87] + - [855, 6996.87] - - [512, 24000, 1, 2048] - - [860, 8832.57] + - [863, 8832.57] - - [128, 256, 1, 3328] - - [799, 3531.47] + - [802, 3531.47] - - [1408, 5056, 1, 3328] - - [855, 7969.84] + - [858, 7969.84] - - [1856, 1856, 1, 3328] - - [842, 8140.24] + - [845, 8140.24] - - [3584, 128, 1, 256] - - [852, 4860.95] + - [855, 4860.95] - - [448, 1408, 1, 3328] - - [842, 6353.65] + - [845, 6353.65] - - [2368, 2368, 1, 256] - - [856, 8369.27] + - [859, 8369.27] - - [4288, 4288, 1, 1280] - - [846, 8666.42] + - [849, 8666.42] - - [64, 448, 1, 1280] - - [799, 2591.82] + - [802, 2591.82] - - [5888, 1024, 1, 1280] - - [839, 8526.5] + - [842, 8526.5] - - [704, 1024, 1, 256] - - [852, 4971.7] + - [855, 4971.7] - - [1024, 12544, 1, 256] - - [890, 8611.8] + - [893, 8611.8] - - [448, 4, 1, 256] - - [795, 78.5534] + - [798, 78.5534] - - [5888, 448, 1, 128] - - [826, 3591.93] + - [829, 3591.93] - - [512, 48000, 1, 2560] - - [860, 9237.34] + - [863, 9237.34] - - [8448, 16, 1, 2816] - - [725, 3360.11] + - [728, 3360.11] - - [704, 6784, 1, 3328] - - [861, 7774.85] + - [864, 7774.85] - - [5888, 5888, 1, 1280] - - [847, 9238.15] + - [850, 9238.15] - - [5056, 1024, 1, 1280] - - [875, 8227.78] + - [878, 8227.78] - - [448, 5888, 1, 3328] - - [850, 7777.53] + - [853, 7777.53] - - [3072, 2, 1, 1024] - - [787, 376.283] + - [790, 376.283] - - [1024, 2944, 1, 1280] - - [840, 8650.35] + - [843, 8650.35] - - [5056, 5888, 1, 1280] - - [850, 8861.5] + - [853, 8861.5] - - [4288, 5888, 1, 128] - - [830, 5048.91] + - [833, 5048.91] - - [256, 3584, 1, 256] - - [842, 6314.01] + - [845, 6314.01] - - [256, 4, 1, 1280] - - [886, 163.84] + - [889, 163.84] - - [1408, 3584, 1, 128] - - [830, 4290.12] + - [833, 4290.12] - - [256, 2944, 1, 3328] - - [852, 7620.89] + - [855, 7620.89] - - [448, 3584, 1, 128] - - [830, 3353.8] + - [833, 3353.8] - - [5888, 2944, 1, 1280] - - [840, 9498.21] + - [843, 9498.21] - - [4, 6784, 1, 1280] - - [790, 623.816] + - [793, 623.816] - - [2368, 5888, 1, 128] - - [829, 4840.19] + - [832, 4840.19] - - [35, 8457, 1, 1760] - - [736, 4059.78] + - [739, 4059.78] - - [64, 2944, 1, 128] - - [747, 1310.72] + - [750, 1310.72] - - [2368, 4, 1, 256] - - [881, 369.639] + - [884, 369.639] - - [3584, 5888, 1, 256] - - [858, 7996.23] + - [861, 7996.23] - - [2368, 1024, 1, 128] - - [824, 3914.97] + - [827, 3914.97] - - [2368, 704, 1, 128] - - [824, 3658.87] + - [827, 3658.87] - - [512, 32, 1, 512] - - [813, 1127.5] + - [816, 1127.5] - - [3584, 2368, 1, 128] - - [824, 4462.38] + - [827, 4462.38] - - [5056, 704, 1, 128] - - [823, 4062.11] + - [826, 4062.11] - - [448, 2368, 1, 128] - - [824, 2828.97] + - [827, 2828.97] - - [4, 5056, 1, 256] - - [772, 425.768] + - [775, 425.768] - - [5056, 1408, 1, 3328] - - [857, 8848.82] + - [860, 8848.82] - - [1408, 704, 1, 256] - - [852, 5394.46] + - [855, 5394.46] - - [6784, 1024, 1, 3328] - - [839, 9231.92] + - [842, 9231.92] - - [6784, 2944, 1, 3328] - - [850, 8714.74] + - [853, 8714.74] - - [7680, 1, 1, 2560] - - [786, 248.745] + - [789, 248.745] - - [1856, 1856, 1, 256] - - [851, 7586.48] + - [854, 7586.48] - - [64, 64, 1, 3328] - - [821, 1363.15] + - [824, 1363.15] - - [512, 1, 1, 512] - - [735, 43.1158] + - [738, 43.1158] - - [6784, 2368, 1, 1280] - - [852, 8665.64] + - [855, 8665.64] - - [4608, 2, 1, 1536] - - [735, 452.55] + - [738, 452.55] - - [4288, 3584, 1, 256] - - [860, 8936.6] + - [863, 8936.6] - - [4288, 5888, 1, 1280] - - [857, 8957.05] + - [860, 8957.05] - - [4608, 4, 1, 1536] - - [728, 846.637] + - [731, 846.637] - - [1024, 6000, 1, 1536] - - [850, 8398.44] + - [853, 8398.44] - - [8448, 32, 1, 2816] - - [810, 5342.97] + - [813, 5342.97] - - [448, 2944, 1, 3328] - - [857, 7246.94] + - [860, 7246.94] - - [4288, 1856, 1, 1280] - - [840, 8902.76] + - [843, 8902.76] - - [1856, 2944, 1, 3328] - - [852, 8622.76] + - [855, 8622.76] - - [256, 6784, 1, 3328] - - [852, 8050.67] + - [855, 8050.67] - - [512, 3000, 1, 1536] - - [873, 7108.02] + - [876, 7108.02] - - [64, 5888, 1, 256] - - [865, 3567.64] + - [868, 3567.64] - - [256, 5056, 1, 128] - - [832, 3041.02] + - [835, 3041.02] - - [5056, 1024, 1, 256] - - [856, 8401.37] + - [859, 8401.37] - - [704, 64, 1, 3328] - - [815, 4298.92] + - [818, 4298.92] - - [5056, 1856, 1, 3328] - - [860, 8660.67] + - [863, 8660.67] - - [4, 2944, 1, 3328] - - [790, 618.537] + - [793, 618.537] - - [512, 1500, 1, 2048] - - [872, 5481.12] + - [875, 5481.12] - - [1024, 1, 1, 500000] - - [726, 259.961] + - [729, 259.961] - - [256, 4, 1, 256] - - [790, 50.4123] + - [793, 50.4123] - - [6784, 128, 1, 3328] - - [844, 6950.81] + - [847, 6950.81] - - [4288, 1408, 1, 128] - - [824, 4539.48] + - [827, 4539.48] - - [1856, 5888, 1, 3328] - - [850, 8712.83] + - [853, 8712.83] - - [4288, 5056, 1, 256] - - [856, 8997.05] + - [859, 8997.05] - - [1408, 128, 1, 1280] - - [779, 4599.02] + - [782, 4599.02] - - [4096, 7000, 1, 4096] - - [846, 8555.79] + - [849, 8555.79] - - [5056, 256, 1, 3328] - - [852, 8257.06] + - [855, 8257.06] - - [704, 704, 1, 256] - - [842, 5852.29] + - [845, 5852.29] - - [1024, 3000, 1, 2560] - - [839, 8258.74] + - [842, 8258.74] - - [1024, 5888, 1, 1280] - - [839, 8988.89] + - [842, 8988.89] - - [6784, 2368, 1, 128] - - [825, 4562.15] + - [828, 4562.15] - - [4, 5056, 1, 1280] - - [790, 600.341] + - [793, 600.341] - - [256, 64, 1, 1280] - - [813, 1899.59] + - [816, 1899.59] - - [128, 1856, 1, 1280] - - [852, 5185.66] + - [855, 5185.66] - - [1856, 1024, 1, 1280] - - [857, 7875.85] + - [860, 7875.85] - - [6784, 4288, 1, 1280] - - [860, 8981.08] + - [863, 8981.08] - - [1856, 1856, 1, 1280] - - [841, 7794.61] + - [844, 7794.61] - - [35, 1500, 1, 2048] - - [741, 2192.5] + - [744, 2192.5] - - [3072, 24000, 1, 1024] - - [853, 8690.48] + - [856, 8690.48] - - [1408, 5056, 1, 1280] - - [852, 8427.77] + - [855, 8427.77] - - [4, 2368, 1, 3328] - - [795, 594.322] + - [798, 594.322] - - [5888, 1856, 1, 128] - - [824, 4293.95] + - [827, 4293.95] - - [448, 704, 1, 1280] - - [847, 4136.29] + - [850, 4136.29] - - [448, 6784, 1, 128] - - [825, 3976.1] + - [828, 3976.1] - - [1024, 448, 1, 3328] - - [857, 6376.23] + - [860, 6376.23] - - [2944, 128, 1, 256] - - [842, 4466.16] + - [845, 4466.16] - - [5056, 3584, 1, 128] - - [830, 4997.08] + - [833, 4997.08] - - [5888, 5888, 1, 3328] - - [860, 8870.27] + - [863, 8870.27] - - [6784, 1024, 1, 256] - - [839, 8520.43] + - [842, 8520.43] - - [2944, 2368, 1, 256] - - [876, 6174.49] + - [879, 6174.49] - - [256, 448, 1, 256] - - [852, 1844.23] + - [855, 1844.23] - - [5056, 5888, 1, 3328] - - [841, 8076.55] + - [844, 8076.55] - - [1856, 1024, 1, 256] - - [852, 7188.82] + - [855, 7188.82] - - [512, 48000, 1, 1536] - - [863, 7282.1] + - [866, 7282.1] - - [3584, 448, 1, 1280] - - [842, 6869.0] + - [845, 6869.0] - - [1024, 1024, 1, 1280] - - [852, 8027.35] + - [855, 8027.35] - - [448, 5888, 1, 256] - - [842, 5765.74] + - [845, 5765.74] - - [2048, 128, 1, 2048] - - [800, 4834.91] + - [803, 4834.91] - - [1408, 6784, 1, 3328] - - [852, 8613.66] + - [855, 8613.66] - - [448, 1024, 1, 128] - - [823, 2315.47] + - [826, 2315.47] - - [4288, 704, 1, 128] - - [824, 4138.82] + - [827, 4138.82] - - [128, 1856, 1, 128] - - [759, 1397.46] + - [762, 1397.46] - - [448, 2368, 1, 3328] - - [842, 6786.38] + - [845, 6786.38] - - [5056, 64, 1, 128] - - [824, 1664.74] + - [827, 1664.74] - - [5056, 2944, 1, 256] - - [875, 7697.39] + - [878, 7697.39] - - [6784, 5888, 1, 128] - - [824, 5003.57] + - [827, 5003.57] - - [1024, 700, 1, 512] - - [852, 6036.21] + - [855, 6036.21] - - [3072, 1, 1, 128] - - [806, 70.2171] + - [809, 70.2171] - - [1024, 4, 1, 256] - - [764, 154.202] + - [767, 154.202] - - [2944, 704, 1, 128] - - [830, 3696.9] + - [833, 3696.9] - - [128, 6784, 1, 1280] - - [842, 6731.41] + - [845, 6731.41] - - [1408, 3584, 1, 3328] - - [840, 9257.97] + - [843, 9257.97] - - [2368, 6784, 1, 256] - - [839, 8840.3] + - [842, 8840.3] - - [5056, 1408, 1, 1280] - - [840, 9240.74] + - [843, 9240.74] - - [5056, 4288, 1, 128] - - [835, 4309.08] + - [838, 4309.08] - - [4, 704, 1, 256] - - [790, 130.597] + - [793, 130.597] - - [4288, 2368, 1, 3328] - - [853, 8755.23] + - [856, 8755.23] - - [1408, 1856, 1, 128] - - [823, 3918.65] + - [826, 3918.65] - - [1408, 5888, 1, 3328] - - [860, 8910.37] + - [863, 8910.37] - - [1856, 256, 1, 256] - - [842, 5631.24] + - [845, 5631.24] - - [6784, 6784, 1, 256] - - [850, 9298.66] + - [853, 9298.66] - - [5888, 5056, 1, 128] - - [825, 4811.26] + - [828, 4811.26] - - [4288, 2368, 1, 128] - - [824, 4749.0] + - [827, 4749.0] - - [128, 5888, 1, 1280] - - [851, 6393.76] + - [854, 6393.76] - - [256, 4288, 1, 1280] - - [842, 6887.69] + - [845, 6887.69] - - [2368, 2944, 1, 256] - - [856, 8314.72] + - [859, 8314.72] - - [4, 1856, 1, 256] - - [879, 266.93] + - [882, 266.93] - - [3584, 1856, 1, 1280] - - [840, 8631.81] + - [843, 8631.81] - - [6784, 6784, 1, 128] - - [830, 5059.86] + - [833, 5059.86] - - [256, 1856, 1, 128] - - [823, 1858.72] + - [826, 1858.72] - - [704, 64, 1, 1280] - - [773, 2849.39] + - [776, 2849.39] - - [5888, 5056, 1, 256] - - [859, 8202.42] + - [862, 8202.42] - - [8448, 48000, 1, 2816] - - [850, 4281.84] + - [853, 4281.84] - - [512, 6000, 1, 2048] - - [842, 8047.79] + - [845, 8047.79] - - [3584, 448, 1, 256] - - [852, 6805.33] + - [855, 6805.33] - - [448, 4288, 1, 128] - - [830, 3500.73] + - [833, 3500.73] - - [7680, 64, 1, 2560] - - [785, 5957.8] + - [788, 5957.8] - - [256, 6784, 1, 256] - - [852, 7331.73] + - [855, 7331.73] - - [1408, 4288, 1, 128] - - [824, 4501.39] + - [827, 4501.39] - - [2944, 704, 1, 3328] - - [852, 8439.6] + - [855, 8439.6] - - [128, 448, 1, 256] - - [773, 1555.09] + - [776, 1555.09] - - [2048, 32, 1, 2048] - - [784, 3226.39] + - [787, 3226.39] - - [3584, 3584, 1, 256] - - [856, 8784.8] + - [859, 8784.8] - - [448, 1408, 1, 128] - - [823, 2535.82] + - [826, 2535.82] - - [128, 256, 1, 1280] - - [799, 2896.62] + - [802, 2896.62] - - [3584, 5056, 1, 256] - - [843, 8566.42] + - [846, 8566.42] - - [6784, 128, 1, 256] - - [842, 6053.87] + - [845, 6053.87] - - [4288, 4, 1, 256] - - [762, 428.8] + - [765, 428.8] - - [64, 1408, 1, 3328] - - [767, 5025.01] + - [770, 5025.01] - - [704, 448, 1, 256] - - [866, 3409.64] + - [869, 3409.64] - - [2944, 2368, 1, 1280] - - [840, 9066.25] + - [843, 9066.25] - - [448, 64, 1, 3328] - - [815, 3528.86] + - [818, 3528.86] - - [704, 6784, 1, 128] - - [829, 4212.51] + - [832, 4212.51] - - [3584, 4, 1, 3328] - - [882, 658.253] + - [885, 658.253] - - [6784, 3584, 1, 256] - - [850, 9061.74] + - [853, 9061.74] - - [704, 448, 1, 128] - - [829, 1552.7] + - [832, 1552.7] - - [256, 128, 1, 128] - - [754, 281.875] + - [757, 281.875] - - [704, 1408, 1, 128] - - [829, 3026.66] + - [832, 3026.66] - - [4, 448, 1, 128] - - [878, 5.46127] + - [881, 5.46127] - - [4288, 128, 1, 1280] - - [809, 5471.54] + - [812, 5471.54] - - [128, 1408, 1, 256] - - [852, 2813.25] + - [855, 2813.25] - - [4, 2944, 1, 256] - - [772, 316.666] + - [775, 316.666] - - [64, 128, 1, 3328] - - [820, 1872.46] + - [823, 1872.46] - - [1856, 1408, 1, 256] - - [842, 7735.79] + - [845, 7735.79] - - [5056, 2368, 1, 128] - - [824, 4830.09] + - [827, 4830.09] - - [2944, 2944, 1, 3328] - - [860, 8890.01] + - [863, 8890.01] - - [5056, 6784, 1, 256] - - [850, 9015.15] + - [853, 9015.15] - - [1856, 3584, 1, 128] - - [831, 4455.02] + - [834, 4455.02] - - [5888, 4, 1, 1280] - - [880, 641.963] + - [883, 641.963] - - [128, 2944, 1, 128] - - [749, 2036.93] + - [752, 2036.93] - - [35, 8457, 1, 2560] - - [737, 3988.13] + - [740, 3988.13] - - [3584, 6784, 1, 128] - - [824, 4774.44] + - [827, 4774.44] - - [128, 4288, 1, 256] - - [842, 4851.75] + - [845, 4851.75] - - [704, 448, 1, 3328] - - [857, 4432.53] + - [860, 4432.53] - - [2368, 6784, 1, 1280] - - [840, 9161.38] + - [843, 9161.38] - - [128, 128, 1, 3328] - - [814, 2839.89] + - [817, 2839.89] - - [5056, 1856, 1, 256] - - [856, 8380.84] + - [859, 8380.84] - - [256, 128, 1, 256] - - [798, 1165.08] + - [801, 1165.08] - - [1024, 3000, 1, 2816] - - [857, 8714.17] + - [860, 8714.17] - - [1024, 1856, 1, 256] - - [847, 7014.69] + - [850, 7014.69] - - [64, 1, 1, 1216] - - [820, 11.7205] + - [823, 11.7205] - - [4288, 64, 1, 128] - - [751, 1669.55] + - [754, 1669.55] - - [256, 448, 1, 3328] - - [775, 5152.29] + - [778, 5152.29] - - [1408, 6784, 1, 1280] - - [860, 8735.12] + - [863, 8735.12] - - [3584, 3584, 1, 1280] - - [857, 9019.99] + - [860, 9019.99] - - [7680, 24000, 1, 2560] - - [860, 6940.14] + - [863, 6940.14] - - [64, 2368, 1, 1280] - - [770, 4432.97] + - [773, 4432.97] - - [448, 2368, 1, 1280] - - [845, 5352.82] + - [848, 5352.82] - - [4608, 48000, 1, 1536] - - [839, 8129.01] + - [842, 8129.01] - - [5888, 5888, 1, 128] - - [832, 4700.81] + - [835, 4700.81] - - [64, 6784, 1, 3328] - - [842, 6170.72] + - [845, 6170.72] - - [2944, 256, 1, 1280] - - [872, 6177.55] + - [875, 6177.55] - - [2048, 16, 1, 2048] - - [794, 2167.6] + - [797, 2167.6] - - [256, 2368, 1, 128] - - [823, 2037.67] + - [826, 2037.67] - - [5056, 2368, 1, 3328] - - [840, 9040.5] + - [843, 9040.5] - - [2944, 4288, 1, 256] - - [871, 7552.12] + - [874, 7552.12] - - [1408, 3584, 1, 1280] - - [847, 8808.66] + - [850, 8808.66] - - [2368, 64, 1, 256] - - [783, 2320.41] + - [786, 2320.41] - - [1024, 128, 1, 128] - - [743, 1075.46] + - [746, 1075.46] - - [704, 128, 1, 3328] - - [776, 4984.92] + - [779, 4984.92] - - [5888, 4, 1, 128] - - [877, 33.5558] + - [880, 33.5558] - - [1856, 704, 1, 256] - - [852, 7110.88] + - [855, 7110.88] - - [1024, 1500, 1, 2816] - - [847, 8499.78] + - [850, 8499.78] - - [8448, 1, 1, 2816] - - [730, 251.369] + - [733, 251.369] - - [1024, 4, 1, 3328] - - [886, 540.932] + - [889, 540.932] - - [1024, 6000, 1, 2048] - - [847, 8698.49] + - [850, 8698.49] - - [512, 24000, 1, 2560] - - [840, 8963.6] + - [843, 8963.6] - - [6144, 3000, 1, 2560] - - [863, 8761.87] + - [866, 8761.87] - - [2368, 6784, 1, 3328] - - [857, 8867.39] + - [860, 8867.39] - - [1856, 1408, 1, 1280] - - [844, 7908.43] + - [847, 7908.43] - - [1856, 448, 1, 1280] - - [857, 6543.91] + - [860, 6543.91] - - [6784, 704, 1, 128] - - [823, 4086.35] + - [826, 4086.35] - - [4, 4, 1, 256] - - [790, 0.752941] + - [793, 0.752941] - - [128, 5888, 1, 128] - - [747, 2582.15] + - [750, 2582.15] - - [5056, 2944, 1, 128] - - [827, 4579.07] + - [830, 4579.07] - - [1408, 5888, 1, 256] - - [839, 8810.67] + - [842, 8810.67] - - [704, 2944, 1, 1280] - - [854, 8420.8] + - [857, 8420.8] - - [4288, 64, 1, 1280] - - [779, 4906.05] + - [782, 4906.05] - - [256, 64, 1, 256] - - [781, 689.853] + - [784, 689.853] - - [1024, 1024, 1, 256] - - [857, 5527.91] + - [860, 5527.91] - - [704, 1856, 1, 256] - - [841, 4452.82] + - [844, 4452.82] - - [2560, 64, 1, 2560] - - [770, 4562.99] + - [773, 4562.99] - - [3584, 704, 1, 1280] - - [847, 7898.67] + - [850, 7898.67] - - [256, 128, 1, 1280] - - [799, 2864.96] + - [802, 2864.96] - - [5888, 2368, 1, 256] - - [846, 8628.27] + - [849, 8628.27] - - [256, 2368, 1, 1280] - - [842, 6073.47] + - [845, 6073.47] - - [2944, 6784, 1, 128] - - [823, 4756.67] + - [826, 4756.67] - - [3584, 448, 1, 3328] - - [842, 7264.97] + - [845, 7264.97] - - [1408, 4, 1, 256] - - [883, 234.057] + - [886, 234.057] - - [704, 2368, 1, 3328] - - [840, 7248.88] + - [843, 7248.88] - - [2944, 448, 1, 256] - - [847, 6365.79] + - [850, 6365.79] - - [1856, 448, 1, 128] - - [825, 2976.24] + - [828, 2976.24] - - [4608, 6000, 1, 1536] - - [860, 9469.32] + - [863, 9469.32] - - [2368, 128, 1, 1280] - - [809, 4773.29] + - [812, 4773.29] - - [256, 5888, 1, 128] - - [824, 3111.9] + - [827, 3111.9] - - [64, 6784, 1, 256] - - [842, 3755.04] + - [845, 3755.04] - - [64, 5056, 1, 1280] - - [803, 4935.5] + - [806, 4935.5] - - [4, 6784, 1, 128] - - [878, 111.042] + - [881, 111.042] - - [3025, 64, 64, 64] - - [892, 6643.65] + - [895, 6643.65] - - [2944, 2944, 1, 1280] - - [840, 8869.45] + - [843, 8869.45] - - [5056, 448, 1, 3328] - - [873, 6706.1] + - [876, 6706.1] - - [4, 3584, 1, 1280] - - [790, 573.44] + - [793, 573.44] - - [1408, 128, 1, 128] - - [742, 1293.09] + - [745, 1293.09] - - [6784, 704, 1, 3328] - - [857, 8368.23] + - [860, 8368.23] - - [128, 64, 1, 1280] - - [816, 1260.31] + - [819, 1260.31] - - [2368, 256, 1, 1280] - - [842, 6154.37] + - [845, 6154.37] - - [4, 448, 1, 3328] - - [795, 351.638] + - [798, 351.638] - - [5888, 4288, 1, 128] - - [824, 4340.89] + - [827, 4340.89] - - [4, 5888, 1, 256] - - [772, 428.218] + - [775, 428.218] - - [1408, 2944, 1, 3328] - - [839, 9400.75] + - [842, 9400.75] - - [3584, 704, 1, 128] - - [826, 3392.45] + - [829, 3392.45] - - [64, 1024, 1, 256] - - [773, 1762.31] + - [776, 1762.31] - - [2368, 448, 1, 1280] - - [866, 5972.48] + - [869, 5972.48] - - [128, 3584, 1, 256] - - [842, 5224.22] + - [845, 5224.22] - - [704, 448, 1, 1280] - - [842, 4566.76] + - [845, 4566.76] - - [448, 5056, 1, 128] - - [824, 3876.09] + - [827, 3876.09] - - [6144, 4, 1, 2560] - - [766, 948.651] + - [769, 948.651] - - [5056, 3584, 1, 256] - - [856, 8162.46] + - [859, 8162.46] - - [4288, 4288, 1, 256] - - [863, 7653.24] + - [866, 7653.24] - - [1408, 5056, 1, 128] - - [830, 4554.24] + - [833, 4554.24] - - [2944, 3584, 1, 128] - - [836, 4146.9] + - [839, 4146.9] - - [3584, 2368, 1, 256] - - [857, 8194.95] + - [860, 8194.95] - - [5888, 5056, 1, 1280] - - [856, 9413.33] + - [859, 9413.33] - - [128, 1024, 1, 1280] - - [809, 4433.73] + - [812, 4433.73] - - [8448, 24000, 1, 2816] - - [850, 5227.02] + - [853, 5227.02] - - [64, 704, 1, 256] - - [773, 1441.79] + - [776, 1441.79] - - [4288, 256, 1, 1280] - - [872, 5687.7] + - [875, 5687.7] - - [3584, 3584, 1, 3328] - - [847, 9183.53] + - [850, 9183.53] - - [704, 64, 1, 128] - - [751, 402.735] + - [754, 402.735] - - [3072, 1500, 1, 128] - - [846, 7394.98] + - [849, 7394.98] - - [2048, 3136, 1, 512] - - [888, 8447.2] + - [891, 8447.2] - - [3025, 256, 64, 64] - - [896, 8063.69] + - [899, 8063.69] - - [5888, 6784, 1, 256] - - [840, 9281.91] + - [843, 9281.91] - - [4288, 2944, 1, 3328] - - [840, 9153.77] + - [843, 9153.77] - - [2944, 64, 1, 128] - - [757, 1463.43] + - [760, 1463.43] - - [1024, 128, 1, 3328] - - [807, 5377.31] + - [810, 5377.31] - - [1024, 16, 1, 500000] - - [723, 3997.03] + - [726, 3997.03] - - [4288, 128, 1, 3328] - - [811, 6053.21] + - [814, 6053.21] - - [7680, 128, 1, 2560] - - [857, 7769.14] + - [860, 7769.14] - - [256, 5056, 1, 1280] - - [866, 7200.74] + - [869, 7200.74] - - [1408, 256, 1, 128] - - [834, 1671.64] + - [837, 1671.64] - - [2944, 5888, 1, 3328] - - [846, 8642.08] + - [849, 8642.08] - - [6784, 5888, 1, 1280] - - [860, 8871.05] + - [863, 8871.05] - - [3072, 1, 1, 1024] - - [806, 205.872] + - [809, 205.872] - - [704, 128, 1, 256] - - [769, 1935.29] + - [772, 1935.29] - - [5888, 4288, 1, 1280] - - [847, 9176.6] + - [850, 9176.6] - - [1024, 24000, 1, 2048] - - [846, 8667.69] + - [849, 8667.69] - - [448, 256, 1, 1280] - - [779, 4327.85] + - [782, 4327.85] - - [5888, 3584, 1, 128] - - [824, 4669.35] + - [827, 4669.35] - - [64, 4288, 1, 3328] - - [804, 5374.94] + - [807, 5374.94] - - [448, 4, 1, 1280] - - [795, 289.616] + - [798, 289.616] - - [6784, 6784, 1, 3328] - - [853, 8306.63] + - [856, 8306.63] - - [5056, 4, 1, 1280] - - [765, 607.099] + - [768, 607.099] - - [4, 5888, 1, 3328] - - [790, 651.438] + - [793, 651.438] - - [256, 1408, 1, 1280] - - [842, 5176.99] + - [845, 5176.99] - - [3072, 16, 1, 1024] - - [801, 2207.53] + - [804, 2207.53] - - [704, 3584, 1, 128] - - [834, 3653.41] + - [837, 3653.41] - - [1024, 2, 1, 512] - - [821, 156.038] + - [824, 156.038] - - [5888, 448, 1, 3328] - - [842, 7896.75] + - [845, 7896.75] - - [2368, 4288, 1, 1280] - - [839, 8517.53] + - [842, 8517.53] - - [4288, 2944, 1, 128] - - [828, 4439.16] + - [831, 4439.16] - - [256, 64, 1, 3328] - - [814, 2704.66] + - [817, 2704.66] - - [2944, 64, 1, 3328] - - [779, 5647.05] + - [782, 5647.05] - - [6784, 64, 1, 3328] - - [852, 6434.51] + - [855, 6434.51] - - [5056, 2944, 1, 3328] - - [863, 8497.1] + - [866, 8497.1] - - [448, 128, 1, 256] - - [781, 1516.54] + - [784, 1516.54] - - [2944, 3584, 1, 256] - - [857, 8365.73] + - [860, 8365.73] - - [1408, 1408, 1, 3328] - - [840, 8440.32] + - [843, 8440.32] - - [1856, 128, 1, 1280] - - [842, 5242.83] + - [845, 5242.83] - - [3584, 3584, 1, 128] - - [824, 4385.84] + - [827, 4385.84] - - [64, 3584, 1, 256] - - [842, 3276.8] + - [845, 3276.8] - - [1408, 4, 1, 3328] - - [765, 605.404] + - [768, 605.404] - - [128, 2944, 1, 3328] - - [810, 6295.65] + - [813, 6295.65] - - [3584, 704, 1, 256] - - [847, 7711.54] + - [850, 7711.54] - - [2944, 448, 1, 3328] - - [858, 6503.87] + - [861, 6503.87] - - [1024, 2, 1, 500000] - - [727, 521.703] + - [730, 521.703] - - [3584, 1408, 1, 3328] - - [849, 8296.1] + - [852, 8296.1] - - [704, 3584, 1, 1280] - - [854, 7670.55] + - [857, 7670.55] - - [1024, 1408, 1, 128] - - [829, 2830.51] + - [832, 2830.51] - - [1856, 6784, 1, 256] - - [860, 8149.57] + - [863, 8149.57] - - [4288, 448, 1, 3328] - - [841, 7406.34] + - [844, 7406.34] - - [6784, 4288, 1, 128] - - [836, 4417.99] + - [839, 4417.99] - - [6784, 704, 1, 1280] - - [857, 8302.35] + - [860, 8302.35] - - [6144, 1, 1, 2560] - - [766, 243.327] + - [769, 243.327] - - [3584, 6784, 1, 256] - - [839, 9036.49] + - [842, 9036.49] - - [6144, 16, 1, 2560] - - [773, 3266.59] + - [776, 3266.59] - - [3584, 64, 1, 128] - - [757, 1555.09] + - [760, 1555.09] - - [5888, 1024, 1, 3328] - - [847, 8887.98] + - [850, 8887.98] - - [448, 64, 1, 128] - - [743, 247.974] + - [746, 247.974] - - [704, 6784, 1, 1280] - - [843, 7892.46] + - [846, 7892.46] - - [4, 448, 1, 256] - - [765, 70.7951] + - [768, 70.7951] - - [5888, 128, 1, 256] - - [841, 5714.99] + - [844, 5714.99] - - [4096, 16, 1, 4096] - - [787, 3251.4] + - [790, 3251.4] - - [1856, 5056, 1, 3328] - - [856, 8740.17] + - [859, 8740.17] - - [4, 6784, 1, 256] - - [879, 360.312] + - [882, 360.312] - - [1024, 3584, 1, 128] - - [824, 3456.17] + - [827, 3456.17] - - [64, 704, 1, 3328] - - [792, 3817.37] + - [795, 3817.37] - - [2368, 2944, 1, 128] - - [830, 4605.37] + - [833, 4605.37] - - [5056, 64, 1, 256] - - [842, 3863.69] + - [845, 3863.69] - - [512, 1500, 1, 1536] - - [842, 6801.46] + - [845, 6801.46] - - [512, 1, 1, 500000] - - [731, 260.968] + - [734, 260.968] - - [5888, 2944, 1, 3328] - - [846, 8501.78] + - [849, 8501.78] - - [128, 3584, 1, 1280] - - [847, 5938.54] + - [850, 5938.54] - - [1024, 704, 1, 128] - - [833, 2172.19] + - [836, 2172.19] - - [1408, 2368, 1, 128] - - [829, 4023.1] + - [832, 4023.1] - - [5888, 2368, 1, 128] - - [830, 4424.52] + - [833, 4424.52] - - [128, 5056, 1, 3328] - - [842, 6692.06] + - [845, 6692.06] - - [3584, 6784, 1, 1280] - - [840, 9488.54] + - [843, 9488.54] - - [4288, 1856, 1, 256] - - [850, 8287.42] + - [853, 8287.42] - - [1856, 5888, 1, 256] - - [861, 7707.73] + - [864, 7707.73] - - [256, 256, 1, 256] - - [808, 1613.19] + - [811, 1613.19] - - [4288, 4288, 1, 3328] - - [850, 8923.49] + - [853, 8923.49] - - [1024, 1024, 1, 128] - - [830, 2553.61] + - [833, 2553.61] - - [4288, 1408, 1, 1280] - - [850, 8930.37] + - [853, 8930.37] - - [3584, 5056, 1, 128] - - [834, 4495.05] + - [837, 4495.05] - - [4, 1024, 1, 3328] - - [790, 415.594] + - [793, 415.594] - - [4, 704, 1, 128] - - [878, 13.8634] + - [881, 13.8634] - - [4288, 2368, 1, 256] - - [875, 7134.98] + - [878, 7134.98] - - [2944, 5056, 1, 1280] - - [847, 9118.51] + - [850, 9118.51] - - [448, 6784, 1, 256] - - [871, 5430.21] + - [874, 5430.21] - - [64, 128, 1, 128] - - [754, 82.957] + - [757, 82.957] - - [1856, 2368, 1, 128] - - [830, 4422.65] + - [833, 4422.65] - - [6784, 2368, 1, 3328] - - [843, 8769.3] + - [846, 8769.3] - - [1408, 6784, 1, 128] - - [830, 4738.9] + - [833, 4738.9] - - [256, 1024, 1, 1280] - - [852, 5722.11] + - [855, 5722.11] - - [704, 4, 1, 128] - - [878, 8.56578] + - [881, 8.56578] - - [1408, 4, 1, 128] - - [878, 26.0439] + - [881, 26.0439] - - [4288, 128, 1, 256] - - [852, 4865.28] + - [855, 4865.28] - - [4288, 1856, 1, 3328] - - [839, 9249.94] + - [842, 9249.94] - - [3584, 448, 1, 128] - - [830, 3029.49] + - [833, 3029.49] - - [64, 4288, 1, 128] - - [747, 1535.28] + - [750, 1535.28] - - [64, 448, 1, 3328] - - [817, 3457.26] + - [820, 3457.26] - - [448, 4, 1, 3328] - - [795, 367.228] + - [798, 367.228] - - [256, 4, 1, 3328] - - [886, 320.289] + - [889, 320.289] - - [4, 1408, 1, 1280] - - [883, 343.939] + - [886, 343.939] - - [3584, 64, 1, 1280] - - [771, 5190.97] + - [774, 5190.97] - - [1408, 448, 1, 128] - - [831, 2218.14] + - [834, 2218.14] - - [3584, 1024, 1, 1280] - - [853, 8253.01] + - [856, 8253.01] - - [1856, 5056, 1, 256] - - [871, 7552.45] + - [874, 7552.45] - - [4, 3584, 1, 256] - - [790, 325.356] + - [793, 325.356] - - [6784, 4288, 1, 3328] - - [846, 8655.24] + - [849, 8655.24] - - [4, 2944, 1, 1280] - - [790, 547.721] + - [793, 547.721] - - [1024, 4288, 1, 256] - - [847, 7788.73] + - [850, 7788.73] - - [5888, 3584, 1, 3328] - - [850, 9173.29] + - [853, 9173.29] - - [1856, 4, 1, 256] - - [881, 282.819] + - [884, 282.819] - - [4, 256, 1, 256] - - [790, 49.6485] + - [793, 49.6485] - - [5056, 3584, 1, 3328] - - [856, 8457.43] + - [859, 8457.43] - - [1408, 128, 1, 3328] - - [810, 5714.42] + - [813, 5714.42] - - [4, 64, 1, 1280] - - [886, 42.6667] + - [889, 42.6667] - - [2368, 1408, 1, 1280] - - [847, 8224.82] + - [850, 8224.82] - - [5056, 2944, 1, 1280] - - [839, 9295.03] + - [842, 9295.03] - - [8448, 6000, 1, 2816] - - [843, 8037.87] + - [846, 8037.87] - - [4, 4, 1, 128] - - [878, 0.0433898] + - [881, 0.0433898] - - [3584, 256, 1, 256] - - [842, 6116.69] + - [845, 6116.69] - - [3584, 2944, 1, 1280] - - [839, 8796.39] + - [842, 8796.39] - - [1024, 6784, 1, 256] - - [846, 8187.76] + - [849, 8187.76] - - [4, 128, 1, 256] - - [790, 30.3407] + - [793, 30.3407] - - [6784, 448, 1, 256] - - [842, 7862.2] + - [845, 7862.2] - - [5124, 9124, 1, 2048] - - [844, 8176.31] + - [847, 8176.31] - - [2944, 5056, 1, 3328] - - [839, 9328.24] + - [842, 9328.24] - - [6784, 4, 1, 128] - - [877, 204.8] + - [880, 204.8] - - [2944, 1408, 1, 128] - - [828, 3838.1] + - [831, 3838.1] - - [448, 128, 1, 3328] - - [793, 4632.06] + - [796, 4632.06] - - [64, 2944, 1, 3328] - - [810, 5663.37] + - [813, 5663.37] - - [5056, 6784, 1, 3328] - - [846, 8420.07] + - [849, 8420.07] - - [704, 2368, 1, 128] - - [830, 3321.69] + - [833, 3321.69] - - [3072, 1500, 1, 1024] - - [847, 8221.67] + - [850, 8221.67] - - [128, 2944, 1, 256] - - [842, 4550.42] + - [845, 4550.42] - - [128, 6784, 1, 128] - - [747, 2767.66] + - [750, 2767.66] - - [3584, 4288, 1, 256] - - [846, 8808.54] + - [849, 8808.54] - - [448, 1856, 1, 256] - - [851, 5166.53] + - [854, 5166.53] - - [1856, 6784, 1, 3328] - - [843, 8339.66] + - [846, 8339.66] - - [3584, 128, 1, 3328] - - [852, 6791.47] + - [855, 6791.47] - - [64, 1856, 1, 256] - - [774, 2209.93] + - [777, 2209.93] - - [64, 448, 1, 256] - - [806, 1008.25] + - [809, 1008.25] - - [5888, 4288, 1, 256] - - [846, 8869.53] + - [849, 8869.53] - - [128, 1500, 1, 1280] - - [803, 4733.44] + - [806, 4733.44] - - [5056, 1408, 1, 256] - - [844, 7523.21] + - [847, 7523.21] - - [35, 8457, 1, 4096] - - [737, 4023.07] + - [740, 4023.07] - - [64, 256, 1, 1280] - - [798, 1941.81] + - [801, 1941.81] - - [2944, 4, 1, 128] - - [877, 95.6426] + - [880, 95.6426] - - [3584, 1024, 1, 256] - - [869, 6553.58] + - [872, 6553.58] - - [512, 6000, 1, 1536] - - [843, 7357.15] + - [846, 7357.15] - - [256, 704, 1, 256] - - [842, 2912.71] + - [845, 2912.71] - - [5888, 5888, 1, 256] - - [853, 8802.6] + - [856, 8802.6] - - [4288, 1024, 1, 1280] - - [846, 8248.73] + - [849, 8248.73] - - [5888, 128, 1, 3328] - - [796, 6848.49] + - [799, 6848.49] - - [448, 6784, 1, 3328] - - [842, 8343.68] + - [845, 8343.68] - - [2944, 1408, 1, 1280] - - [839, 9229.38] + - [842, 9229.38] - - [3072, 6000, 1, 1024] - - [860, 9014.91] + - [863, 9014.91] - - [1024, 32, 1, 512] - - [781, 1497.97] + - [784, 1497.97] - - [2944, 1856, 1, 3328] - - [856, 7176.38] + - [859, 7176.38] - - [2368, 64, 1, 128] - - [747, 1206.38] + - [750, 1206.38] - - [256, 1024, 1, 128] - - [824, 1178.18] + - [827, 1178.18] - - [3584, 5888, 1, 1280] - - [846, 9023.48] + - [849, 9023.48] - - [64, 4, 1, 128] - - [878, 0.989372] + - [881, 0.989372] - - [6784, 1856, 1, 1280] - - [840, 8964.41] + - [843, 8964.41] - - [2944, 5056, 1, 256] - - [846, 8860.02] + - [849, 8860.02] - - [5888, 256, 1, 3328] - - [857, 8308.56] + - [860, 8308.56] - - [2944, 4288, 1, 128] - - [825, 4507.51] + - [828, 4507.51] - - [3584, 1408, 1, 256] - - [840, 8234.61] + - [843, 8234.61] - - [704, 3584, 1, 3328] - - [852, 7377.16] + - [855, 7377.16] - - [5056, 448, 1, 1280] - - [841, 7145.37] + - [844, 7145.37] - - [3584, 1856, 1, 3328] - - [857, 8954.71] + - [860, 8954.71] - - [64, 1408, 1, 128] - - [754, 731.874] + - [757, 731.874] - - [4288, 6784, 1, 1280] - - [846, 9166.45] + - [849, 9166.45] - - [1024, 3000, 1, 2048] - - [857, 7723.73] + - [860, 7723.73] - - [1408, 704, 1, 1280] - - [847, 7863.0] + - [850, 7863.0] - - [2944, 1024, 1, 256] - - [840, 5034.92] + - [843, 5034.92] - - [256, 64, 1, 128] - - [746, 150.657] + - [749, 150.657] - - [2368, 4288, 1, 3328] - - [844, 8568.74] + - [847, 8568.74] - - [4, 1408, 1, 256] - - [790, 219.785] + - [793, 219.785] - - [1024, 1408, 1, 1280] - - [872, 6761.03] + - [875, 6761.03] - - [64, 64, 1, 256] - - [772, 198.594] + - [775, 198.594] - - [704, 256, 1, 3328] - - [842, 4291.52] + - [845, 4291.52] - - [6784, 5056, 1, 256] - - [841, 8544.92] + - [844, 8544.92] - - [1856, 1856, 1, 128] - - [829, 4034.83] + - [832, 4034.83] - - [4288, 5888, 1, 256] - - [860, 8997.95] + - [863, 8997.95] - - [4, 704, 1, 3328] - - [795, 452.3] + - [798, 452.3] - - [35, 8457, 1, 2048] - - [738, 3375.27] + - [741, 3375.27] - - [448, 2944, 1, 256] - - [842, 6346.64] + - [845, 6346.64] - - [4, 4288, 1, 3328] - - [795, 630.878] + - [798, 630.878] - - [2944, 6784, 1, 256] - - [869, 8002.82] + - [872, 8002.82] - - [2944, 2944, 1, 128] - - [824, 4661.31] + - [827, 4661.31] - - [4, 4, 1, 1280] - - [795, 3.04762] + - [798, 3.04762] - - [1856, 3584, 1, 1280] - - [839, 8677.56] + - [842, 8677.56] - - [64, 2944, 1, 256] - - [842, 2926.85] + - [845, 2926.85] - - [3584, 1408, 1, 1280] - - [853, 8238.8] + - [856, 8238.8] - - [448, 256, 1, 128] - - [754, 1042.62] + - [757, 1042.62] - - [4288, 448, 1, 128] - - [830, 3698.72] + - [833, 3698.72] - - [5056, 256, 1, 1280] - - [847, 7058.4] + - [850, 7058.4] - - [1856, 1408, 1, 3328] - - [844, 8348.25] + - [847, 8348.25] - - [128, 128, 1, 128] - - [754, 145.636] + - [757, 145.636] - - [1024, 4288, 1, 3328] - - [840, 8042.51] + - [843, 8042.51] - - [448, 2368, 1, 256] - - [852, 5934.9] + - [855, 5934.9] - - [1024, 4, 1, 128] - - [878, 15.83] + - [881, 15.83] - - [64, 1408, 1, 1280] - - [776, 3865.39] + - [779, 3865.39] - - [64, 6784, 1, 1280] - - [872, 5629.51] + - [875, 5629.51] - - [5056, 448, 1, 256] - - [842, 7637.81] + - [845, 7637.81] - - [2944, 2368, 1, 3328] - - [850, 9112.34] + - [853, 9112.34] - - [704, 4288, 1, 3328] - - [842, 7950.1] + - [845, 7950.1] - - [1408, 128, 1, 256] - - [842, 2898.07] + - [845, 2898.07] - - [1024, 1856, 1, 1280] - - [840, 8087.41] + - [843, 8087.41] - - [6784, 1856, 1, 256] - - [871, 7538.15] + - [874, 7538.15] - - [512, 48000, 1, 2816] - - [839, 9704.11] + - [842, 9704.11] - - [512, 3000, 1, 2816] - - [841, 7621.53] + - [844, 7621.53] - - [128, 2368, 1, 3328] - - [804, 6038.84] + - [807, 6038.84] - - [1024, 5888, 1, 256] - - [856, 8185.72] + - [859, 8185.72] - - [64, 2944, 1, 1280] - - [803, 4540.14] + - [806, 4540.14] - - [6784, 1408, 1, 256] - - [856, 8573.9] + - [859, 8573.9] - - [5056, 64, 1, 3328] - - [804, 6310.87] + - [807, 6310.87] - - [128, 704, 1, 128] - - [743, 696.518] + - [746, 696.518] - - [1408, 2368, 1, 256] - - [842, 4994.96] + - [845, 4994.96] - - [1408, 1408, 1, 256] - - [839, 7552.24] + - [842, 7552.24] - - [4, 64, 1, 128] - - [877, 1.80441] + - [880, 1.80441] - - [64, 128, 1, 1280] - - [816, 1272.54] + - [819, 1272.54] - - [1024, 8, 1, 500000] - - [724, 2013.13] + - [727, 2013.13] - - [4, 2368, 1, 128] - - [878, 49.8526] + - [881, 49.8526] - - [2368, 2368, 1, 128] - - [829, 4483.7] + - [832, 4483.7] - - [64, 5888, 1, 128] - - [746, 1957.57] + - [749, 1957.57] - - [5888, 4, 1, 3328] - - [879, 638.698] + - [882, 638.698] - - [6784, 1408, 1, 128] - - [824, 4715.51] + - [827, 4715.51] - - [1408, 5056, 1, 256] - - [856, 8557.57] + - [859, 8557.57] - - [512, 50176, 1, 128] - - [887, 8809.29] + - [890, 8809.29] - - [5056, 128, 1, 3328] - - [779, 6810.56] + - [782, 6810.56] - - [128, 128, 1, 1280] - - [813, 1899.59] + - [816, 1899.59] - - [512, 2, 1, 512] - - [733, 87.3813] + - [736, 87.3813] - - [448, 704, 1, 256] - - [852, 3765.87] + - [855, 3765.87] - - [4288, 3584, 1, 128] - - [837, 4563.67] + - [840, 4563.67] - - [2944, 128, 1, 3328] - - [779, 6507.35] + - [782, 6507.35] - - [128, 5056, 1, 1280] - - [842, 6557.75] + - [845, 6557.75] - - [3584, 5056, 1, 1280] - - [839, 9407.83] + - [842, 9407.83] - - [256, 448, 1, 1280] - - [803, 4096.0] + - [806, 4096.0] - - [704, 704, 1, 128] - - [829, 2374.21] + - [832, 2374.21] - - [5056, 4, 1, 128] - - [877, 125.42] + - [880, 125.42] - - [704, 256, 1, 1280] - - [852, 4016.13] + - [855, 4016.13] - - [64, 2368, 1, 3328] - - [809, 5159.19] + - [812, 5159.19] - - [1856, 1024, 1, 128] - - [829, 3356.37] + - [832, 3356.37] - - [1856, 64, 1, 128] - - [746, 945.544] + - [749, 945.544] - - [4096, 64, 1, 4096] - - [812, 6260.14] + - [815, 6260.14] - - [1024, 24000, 1, 1536] - - [856, 9368.4] + - [859, 9368.4] - - [704, 4288, 1, 256] - - [853, 7329.29] + - [856, 7329.29] - - [5888, 2368, 1, 1280] - - [842, 8624.61] + - [845, 8624.61] - - [6784, 1856, 1, 3328] - - [846, 9012.35] + - [849, 9012.35] - - [64, 128, 1, 256] - - [772, 374.491] + - [775, 374.491] - - [2368, 5888, 1, 1280] - - [840, 9045.66] + - [843, 9045.66] - - [5888, 256, 1, 1280] - - [857, 7999.07] + - [860, 7999.07] - - [4, 5888, 1, 1280] - - [790, 615.739] + - [793, 615.739] - - [704, 128, 1, 128] - - [746, 693.169] + - [749, 693.169] - - [1024, 4, 1, 1280] - - [885, 372.364] + - [888, 372.364] - - [2368, 1856, 1, 3328] - - [857, 8246.81] + - [860, 8246.81] - - [2368, 128, 1, 128] - - [747, 1963.43] + - [750, 1963.43] - - [2944, 704, 1, 256] - - [857, 7116.14] + - [860, 7116.14] - - [5056, 128, 1, 128] - - [750, 2519.39] + - [753, 2519.39] - - [2368, 1024, 1, 3328] - - [842, 7959.03] + - [845, 7959.03] - - [35, 700, 1, 2048] - - [738, 1766.76] + - [741, 1766.76] - - [256, 704, 1, 3328] - - [842, 4296.46] + - [845, 4296.46] - - [704, 3584, 1, 256] - - [841, 7441.51] + - [844, 7441.51] - - [704, 2944, 1, 3328] - - [858, 7195.71] + - [861, 7195.71] - - [6784, 1024, 1, 128] - - [829, 4509.08] + - [832, 4509.08] - - [256, 448, 1, 128] - - [754, 837.903] + - [757, 837.903] - - [448, 1024, 1, 3328] - - [852, 6515.55] + - [855, 6515.55] - - [2944, 1024, 1, 3328] - - [847, 8751.53] + - [850, 8751.53] - - [2944, 5056, 1, 128] - - [824, 4799.63] + - [827, 4799.63] - - [2368, 256, 1, 256] - - [841, 4754.57] + - [844, 4754.57] - - [1408, 6784, 1, 256] - - [869, 7476.99] + - [872, 7476.99] - - [6784, 1408, 1, 3328] - - [847, 8968.47] + - [850, 8968.47] - - [4288, 6784, 1, 128] - - [822, 4455.64] + - [825, 4455.64] - - [1408, 2944, 1, 128] - - [834, 3862.69] + - [837, 3862.69] - - [704, 64, 1, 256] - - [773, 1441.79] + - [776, 1441.79] - - [3072, 4, 1, 1024] - - [791, 711.703] + - [794, 711.703] - - [256, 2368, 1, 3328] - - [866, 5199.63] + - [869, 5199.63] - - [6784, 2944, 1, 1280] - - [850, 8914.35] + - [853, 8914.35] - - [4288, 1856, 1, 128] - - [830, 4683.2] + - [833, 4683.2] - - [1856, 2944, 1, 128] - - [824, 4589.24] + - [827, 4589.24] - - [6784, 448, 1, 128] - - [824, 3918.43] + - [827, 3918.43] - - [64, 3584, 1, 128] - - [755, 1468.01] + - [758, 1468.01] - - [448, 5056, 1, 1280] - - [847, 7561.3] + - [850, 7561.3] - - [4288, 5056, 1, 1280] - - [839, 9304.01] + - [842, 9304.01] - - [2368, 1856, 1, 128] - - [829, 4322.07] + - [832, 4322.07] - - [128, 448, 1, 1280] - - [809, 3336.38] + - [812, 3336.38] - - [4288, 704, 1, 256] - - [852, 7834.55] + - [855, 7834.55] - - [256, 3584, 1, 128] - - [825, 2500.86] + - [828, 2500.86] - - [5888, 704, 1, 256] - - [871, 7244.39] + - [874, 7244.39] - - [3584, 1024, 1, 128] - - [836, 3168.93] + - [839, 3168.93] - - [256, 5888, 1, 3328] - - [857, 7763.37] + - [860, 7763.37] - - [1408, 4288, 1, 3328] - - [839, 9273.7] + - [842, 9273.7] - - [6784, 4288, 1, 256] - - [847, 8825.1] + - [850, 8825.1] - - [4288, 256, 1, 128] - - [826, 2621.44] + - [829, 2621.44] - - [448, 1856, 1, 3328] - - [867, 5859.7] + - [870, 5859.7] - - [5888, 256, 1, 256] - - [857, 7124.74] + - [860, 7124.74] - - [1024, 4, 1, 500000] - - [722, 1030.1] + - [725, 1030.1] - - [6784, 1024, 1, 1280] - - [839, 9083.01] + - [842, 9083.01] - - [5888, 1024, 1, 128] - - [826, 4297.06] + - [829, 4297.06] - - [1024, 128, 1, 256] - - [842, 2086.72] + - [845, 2086.72] - - [512, 16, 1, 500000] - - [723, 3921.86] + - [726, 3921.86] - - [128, 64, 1, 3328] - - [813, 1969.87] + - [816, 1969.87] - - [448, 64, 1, 256] - - [798, 1092.27] + - [801, 1092.27] - - [2368, 256, 1, 128] - - [829, 2174.74] + - [832, 2174.74] - - [6784, 3584, 1, 1280] - - [839, 9558.72] + - [842, 9558.72] - - [1024, 6784, 1, 1280] - - [848, 8637.62] + - [851, 8637.62] - - [2944, 64, 1, 1280] - - [770, 4770.03] + - [773, 4770.03] - - [1408, 2944, 1, 1280] - - [839, 9238.37] + - [842, 9238.37] - - [256, 1856, 1, 256] - - [865, 4498.33] + - [868, 4498.33] - - [1408, 2368, 1, 3328] - - [847, 8344.87] + - [850, 8344.87] - - [2944, 4, 1, 3328] - - [882, 661.109] + - [885, 661.109] - - [128, 1408, 1, 3328] - - [810, 5641.32] + - [813, 5641.32] - - [2944, 1856, 1, 128] - - [824, 4487.94] + - [827, 4487.94] - - [256, 2944, 1, 128] - - [834, 2233.08] + - [837, 2233.08] - - [256, 6784, 1, 128] - - [823, 3139.8] + - [826, 3139.8] - - [2368, 4, 1, 128] - - [878, 38.6612] + - [881, 38.6612] - - [1408, 256, 1, 3328] - - [874, 4927.57] + - [877, 4927.57] - - [1856, 4, 1, 128] - - [878, 42.2719] + - [881, 42.2719] - - [1024, 16, 1, 512] - - [790, 1115.51] + - [793, 1115.51] - - [5056, 6784, 1, 128] - - [825, 4963.35] + - [828, 4963.35] - - [4288, 5056, 1, 128] - - [823, 4927.99] + - [826, 4927.99] - - [1856, 5888, 1, 128] - - [830, 4865.05] + - [833, 4865.05] - - [7680, 2, 1, 2560] - - [766, 499.512] + - [769, 499.512] - - [3584, 1856, 1, 256] - - [856, 7978.28] + - [859, 7978.28] - - [4288, 3584, 1, 1280] - - [856, 7852.16] + - [859, 7852.16] - - [2368, 448, 1, 256] - - [871, 5238.83] + - [874, 5238.83] - - [4288, 256, 1, 3328] - - [842, 6751.24] + - [845, 6751.24] - - [1856, 704, 1, 128] - - [824, 3525.46] + - [827, 3525.46] - - [1408, 64, 1, 256] - - [783, 1884.7] + - [786, 1884.7] - - [64, 1856, 1, 128] - - [760, 888.105] + - [763, 888.105] - - [4, 256, 1, 128] - - [877, 7.28178] + - [880, 7.28178] - - [512, 16, 1, 512] - - [790, 663.656] + - [793, 663.656] - - [704, 5888, 1, 128] - - [824, 4424.45] + - [827, 4424.45] - - [6784, 3584, 1, 128] - - [826, 3823.3] + - [829, 3823.3] - - [1024, 64, 1, 256] - - [768, 1379.71] + - [771, 1379.71] - - [64, 2368, 1, 256] - - [842, 2424.83] + - [845, 2424.83] - - [5124, 1500, 1, 2048] - - [860, 8391.74] + - [863, 8391.74] - - [4288, 5056, 1, 3328] - - [846, 9274.04] + - [849, 9274.04] - - [4, 1856, 1, 1280] - - [790, 453.374] + - [793, 453.374] - - [4288, 128, 1, 128] - - [824, 2157.7] + - [827, 2157.7] - - [512, 2, 1, 500000] - - [734, 516.795] + - [737, 516.795] - - [1408, 1408, 1, 128] - - [825, 3600.39] + - [828, 3600.39] - - [7680, 16, 1, 2560] - - [805, 3542.49] + - [808, 3542.49] - - [1856, 128, 1, 128] - - [757, 1532.7] + - [760, 1532.7] - - [5056, 2368, 1, 256] - - [869, 7683.97] + - [872, 7683.97] - - [4288, 704, 1, 3328] - - [842, 7642.86] + - [845, 7642.86] - - [448, 3584, 1, 256] - - [852, 6733.97] + - [855, 6733.97] - - [2368, 64, 1, 1280] - - [803, 3962.14] + - [806, 3962.14] - - [2368, 1024, 1, 1280] - - [854, 7989.54] + - [857, 7989.54] - - [2944, 1408, 1, 3328] - - [857, 8954.56] + - [860, 8954.56] - - [6144, 1500, 1, 2560] - - [875, 8169.97] + - [878, 8169.97] - - [4224, 1, 1, 128] - - [806, 76.8] + - [809, 76.8] - - [1024, 1408, 1, 3328] - - [872, 6961.28] + - [875, 6961.28] - - [2944, 5888, 1, 1280] - - [853, 8797.43] + - [856, 8797.43] - - [8448, 2, 1, 2816] - - [728, 496.858] + - [731, 496.858] - - [1408, 4, 1, 1280] - - [883, 471.791] + - [886, 471.791] - - [5888, 3584, 1, 256] - - [860, 8246.2] + - [863, 8246.2] - - [2368, 5056, 1, 128] - - [823, 4906.8] + - [826, 4906.8] - - [1408, 1856, 1, 3328] - - [847, 9006.7] + - [850, 9006.7] - - [4, 4, 1, 3328] - - [795, 5.73793] + - [798, 5.73793] - - [5888, 5056, 1, 3328] - - [860, 8545.0] + - [863, 8545.0] - - [7680, 6000, 1, 2560] - - [853, 7995.9] + - [856, 7995.9] - - [6784, 1408, 1, 1280] - - [847, 8888.03] + - [850, 8888.03] - - [4, 1024, 1, 1280] - - [795, 302.009] + - [798, 302.009] - - [512, 3000, 1, 2560] - - [847, 7809.33] + - [850, 7809.33] - - [704, 2944, 1, 256] - - [852, 4909.14] + - [855, 4909.14] - - [4288, 64, 1, 256] - - [852, 3264.62] + - [855, 3264.62] - - [6784, 5888, 1, 3328] - - [860, 9544.42] + - [863, 9544.42] - - [2368, 4288, 1, 128] - - [823, 4872.93] + - [826, 4872.93] - - [64, 4288, 1, 1280] - - [809, 4656.32] + - [812, 4656.32] - - [6784, 64, 1, 1280] - - [842, 6230.33] + - [845, 6230.33] - - [3584, 128, 1, 128] - - [750, 2315.47] + - [753, 2315.47] - - [1024, 6784, 1, 128] - - [824, 3758.84] + - [827, 3758.84] - - [1024, 1500, 1, 1536] - - [873, 6971.9] + - [876, 6971.9] - - [1408, 64, 1, 3328] - - [776, 5079.48] + - [779, 5079.48] - - [6784, 4, 1, 256] - - [762, 487.838] + - [765, 487.838] - - [1408, 1408, 1, 1280] - - [875, 7423.21] + - [878, 7423.21] - - [256, 2368, 1, 256] - - [842, 4986.8] + - [845, 4986.8] - - [3072, 3000, 1, 1024] - - [844, 7843.91] + - [847, 7843.91] - - [448, 4288, 1, 3328] - - [843, 7204.69] + - [846, 7204.69] - - [2368, 1408, 1, 256] - - [875, 5897.86] + - [878, 5897.86] - - [704, 2368, 1, 256] - - [842, 7000.83] + - [845, 7000.83] - - [1024, 24000, 1, 2560] - - [869, 8562.21] + - [872, 8562.21] - - [2944, 448, 1, 1280] - - [857, 7155.83] + - [860, 7155.83] - - [5888, 2368, 1, 3328] - - [856, 9252.32] + - [859, 9252.32] - - [1024, 256, 1, 128] - - [838, 1255.78] + - [841, 1255.78] - - [5124, 9124, 1, 1760] - - [850, 9168.39] + - [853, 9168.39] - - [448, 1408, 1, 1280] - - [842, 6150.24] + - [845, 6150.24] - - [448, 1856, 1, 1280] - - [857, 6489.66] + - [860, 6489.66] - - [4288, 448, 1, 1280] - - [872, 6886.92] + - [875, 6886.92] - - [5888, 704, 1, 3328] - - [852, 8230.54] + - [855, 8230.54] - - [4, 1856, 1, 128] - - [878, 26.9964] + - [881, 26.9964] - - [5056, 256, 1, 128] - - [823, 3468.91] + - [826, 3468.91] - - [1856, 256, 1, 128] - - [824, 2534.06] + - [827, 2534.06] - - [128, 2368, 1, 256] - - [842, 3660.12] + - [845, 3660.12] - - [704, 4, 1, 256] - - [790, 134.496] + - [793, 134.496] - - [1024, 6784, 1, 3328] - - [844, 8482.65] + - [847, 8482.65] - - [1408, 5888, 1, 128] - - [824, 4644.42] + - [827, 4644.42] - - [4288, 4, 1, 128] - - [877, 35.7799] + - [880, 35.7799] - - [512, 3136, 1, 2048] - - [889, 6386.59] + - [892, 6386.59] - - [1408, 1024, 1, 256] - - [842, 5440.72] + - [845, 5440.72] - - [128, 64, 1, 256] - - [772, 379.919] + - [775, 379.919] - - [8448, 1500, 1, 2816] - - [839, 9155.82] + - [842, 9155.82] - - [256, 704, 1, 128] - - [824, 895.523] + - [827, 895.523] - - [2560, 7000, 1, 2560] - - [851, 8565.56] + - [854, 8565.56] - - [5888, 64, 1, 1280] - - [866, 5007.73] + - [869, 5007.73] - - [128, 4, 1, 3328] - - [885, 165.11] + - [888, 165.11] - - [5056, 6784, 1, 1280] - - [850, 9331.38] + - [853, 9331.38] - - [1024, 448, 1, 1280] - - [852, 6501.36] + - [855, 6501.36] - - [704, 5056, 1, 3328] - - [839, 8090.03] + - [842, 8090.03] - - [128, 5056, 1, 256] - - [852, 5537.27] + - [855, 5537.27] - - [3584, 5056, 1, 3328] - - [848, 8633.14] + - [851, 8633.14] - - [1856, 4, 1, 3328] - - [886, 582.714] + - [889, 582.714] - - [4, 2944, 1, 128] - - [877, 114.192] + - [880, 114.192] - - [2368, 2944, 1, 3328] - - [856, 8749.45] + - [859, 8749.45] - - [448, 448, 1, 1280] - - [780, 4694.83] + - [783, 4694.83] - - [128, 4, 1, 128] - - [877, 4.84734] + - [880, 4.84734] - - [2368, 3584, 1, 256] - - [856, 8418.49] + - [859, 8418.49] - - [4608, 3000, 1, 1536] - - [846, 9076.37] + - [849, 9076.37] - - [1024, 256, 1, 1280] - - [852, 5562.74] + - [855, 5562.74] - - [5056, 3584, 1, 1280] - - [846, 8364.99] + - [849, 8364.99] - - [5124, 9124, 1, 4096] - - [856, 8648.48] + - [859, 8648.48] - - [7680, 48000, 1, 2560] - - [850, 4098.16] + - [853, 4098.16] - - [1856, 704, 1, 1280] - - [842, 8140.94] + - [845, 8140.94] - - [1856, 2944, 1, 1280] - - [844, 8214.3] + - [847, 8214.3] - - [4608, 1500, 1, 1536] - - [852, 8424.43] + - [855, 8424.43] - - [1024, 48000, 1, 2816] - - [843, 8513.08] + - [846, 8513.08] - - [5124, 9124, 1, 2560] - - [860, 8641.14] + - [863, 8641.14] - - [128, 1024, 1, 256] - - [774, 2356.35] + - [777, 2356.35] - - [2944, 1408, 1, 256] - - [856, 8254.19] + - [859, 8254.19] - - [4288, 1408, 1, 3328] - - [850, 9138.39] + - [853, 9138.39] - - [3584, 64, 1, 3328] - - [763, 5629.52] + - [766, 5629.52] - - [5888, 2944, 1, 128] - - [824, 4119.23] + - [827, 4119.23] - - [2944, 1024, 1, 128] - - [826, 4002.86] + - [829, 4002.86] - - [128, 1, 1, 1024] - - [820, 19.9805] + - [823, 19.9805] - - [5124, 700, 1, 2048] - - [857, 7653.74] + - [860, 7653.74] - - [4, 4288, 1, 1280] - - [790, 587.649] + - [793, 587.649] - - [6784, 5056, 1, 128] - - [829, 4855.75] + - [832, 4855.75] - - [256, 1024, 1, 3328] - - [852, 6116.18] + - [855, 6116.18] - - [3584, 4, 1, 256] - - [764, 395.476] + - [767, 395.476] - - [1856, 64, 1, 3328] - - [779, 5732.5] + - [782, 5732.5] - - [4, 128, 1, 3328] - - [885, 162.589] + - [888, 162.589] - - [256, 12544, 1, 1024] - - [889, 7628.82] + - [892, 7628.82] - - [5888, 1408, 1, 3328] - - [850, 9524.33] + - [853, 9524.33] - - [448, 2944, 1, 128] - - [824, 3163.81] + - [827, 3163.81] - - [2368, 1856, 1, 256] - - [852, 8167.26] + - [855, 8167.26] - - [256, 5056, 1, 256] - - [842, 7292.03] + - [845, 7292.03] - - [5056, 5056, 1, 128] - - [830, 5043.89] + - [833, 5043.89] - - [448, 3584, 1, 3328] - - [847, 6839.46] + - [850, 6839.46] - - [4, 5056, 1, 3328] - - [795, 639.786] + - [798, 639.786] - - [256, 256, 1, 128] - - [754, 554.802] + - [757, 554.802] - - [5888, 256, 1, 128] - - [826, 3562.37] + - [829, 3562.37] - - [4, 5056, 1, 128] - - [877, 149.807] + - [880, 149.807] - - [448, 256, 1, 256] - - [773, 2121.4] + - [776, 2121.4] - - [704, 4, 1, 3328] - - [883, 455.819] + - [886, 455.819] - - [1408, 256, 1, 256] - - [842, 4352.58] + - [845, 4352.58] - - [3584, 1856, 1, 128] - - [833, 3933.13] + - [836, 3933.13] - - [4288, 4288, 1, 128] - - [824, 4888.51] + - [827, 4888.51] - - [1856, 1024, 1, 3328] - - [860, 8242.54] + - [863, 8242.54] - - [1856, 4288, 1, 128] - - [829, 4647.3] + - [832, 4647.3] - - [1024, 6000, 1, 2560] - - [854, 8526.65] + - [857, 8526.65] - - [1024, 5056, 1, 256] - - [839, 7343.73] + - [842, 7343.73] - - [5056, 5888, 1, 128] - - [828, 4053.4] + - [831, 4053.4] - - [2368, 1408, 1, 3328] - - [842, 8466.1] + - [845, 8466.1] - - [1024, 48000, 1, 1536] - - [860, 9487.64] + - [863, 9487.64] - - [5888, 448, 1, 256] - - [873, 6081.44] + - [876, 6081.44] - - [5888, 6784, 1, 128] - - [825, 4820.17] + - [828, 4820.17] - - [2368, 4, 1, 3328] - - [884, 620.528] + - [887, 620.528] - - [6784, 5056, 1, 1280] - - [869, 8525.4] + - [872, 8525.4] - - [5056, 704, 1, 1280] - - [839, 7932.96] + - [842, 7932.96] - - [1024, 48000, 1, 2560] - - [860, 8877.84] + - [863, 8877.84] - - [4608, 32, 1, 1536] - - [789, 3556.73] + - [792, 3556.73] - - [1024, 2368, 1, 128] - - [832, 2943.65] + - [835, 2943.65] - - [128, 704, 1, 256] - - [773, 2059.7] + - [776, 2059.7] - - [2368, 448, 1, 3328] - - [852, 5290.32] + - [855, 5290.32] - - [128, 5888, 1, 3328] - - [852, 7764.33] + - [855, 7764.33] - - [448, 128, 1, 1280] - - [803, 3373.18] + - [806, 3373.18] - - [6784, 4, 1, 3328] - - [762, 675.963] + - [765, 675.963] - - [4288, 4, 1, 1280] - - [795, 564.675] + - [798, 564.675] - - [1024, 64, 1, 3328] - - [809, 4293.38] + - [812, 4293.38] - - [3072, 48000, 1, 1024] - - [859, 7826.41] + - [862, 7826.41] - - [256, 4, 1, 128] - - [878, 4.83304] + - [881, 4.83304] - - [1024, 5888, 1, 128] - - [837, 3610.36] + - [840, 3610.36] - - [3584, 5888, 1, 128] - - [825, 4722.25] + - [828, 4722.25] - - [5056, 5888, 1, 256] - - [860, 9159.01] + - [863, 9159.01] - - [2368, 1024, 1, 256] - - [852, 7482.61] + - [855, 7482.61] - - [2944, 1856, 1, 256] - - [856, 8208.9] + - [859, 8208.9] - - [1856, 6784, 1, 1280] - - [852, 8205.33] + - [855, 8205.33] - - [64, 5056, 1, 128] - - [747, 2079.25] + - [750, 2079.25] - - [64, 6784, 1, 128] - - [747, 2437.48] + - [750, 2437.48] - - [448, 704, 1, 128] - - [823, 1506.35] + - [826, 1506.35] - - [4, 1024, 1, 128] - - [878, 17.2463] + - [881, 17.2463] - - [1408, 448, 1, 256] - - [842, 5545.35] + - [845, 5545.35] - - [1408, 704, 1, 128] - - [828, 2931.55] + - [831, 2931.55] - - [64, 256, 1, 3328] - - [814, 2816.42] + - [817, 2816.42] - - [8448, 3000, 1, 2816] - - [848, 8872.89] + - [851, 8872.89] - - [6784, 448, 1, 3328] - - [842, 7555.38] + - [845, 7555.38] - - [5056, 1856, 1, 1280] - - [840, 8652.26] + - [843, 8652.26] - - [1408, 1024, 1, 3328] - - [844, 7781.32] + - [847, 7781.32] - - [2368, 256, 1, 3328] - - [848, 5391.96] + - [851, 5391.96] - - [7680, 1500, 1, 2560] - - [846, 8919.62] + - [849, 8919.62] - - [5888, 3584, 1, 1280] - - [846, 9235.75] + - [849, 9235.75] - - [1856, 3584, 1, 3328] - - [857, 8348.73] + - [860, 8348.73] - - [5888, 128, 1, 1280] - - [842, 5928.51] + - [845, 5928.51] - - [1024, 2944, 1, 256] - - [873, 6630.17] + - [876, 6630.17] - - [448, 6784, 1, 1280] - - [854, 8332.35] + - [857, 8332.35] - - [256, 3584, 1, 1280] - - [844, 7140.09] + - [847, 7140.09] - - [448, 128, 1, 128] - - [746, 552.713] + - [749, 552.713] - - [704, 5056, 1, 256] - - [852, 7959.58] + - [855, 7959.58] - - [3584, 1024, 1, 3328] - - [844, 8386.74] + - [847, 8386.74] - - [2944, 1856, 1, 1280] - - [860, 7670.19] + - [863, 7670.19] - - [128, 256, 1, 128] - - [761, 258.27] + - [764, 258.27] - - [5056, 256, 1, 256] - - [852, 5736.67] + - [855, 5736.67] - - [2944, 4288, 1, 3328] - - [839, 8730.7] + - [842, 8730.7] - - [2368, 3584, 1, 3328] - - [841, 8437.61] + - [844, 8437.61] - - [2944, 704, 1, 1280] - - [852, 8342.43] + - [855, 8342.43] - - [128, 4, 1, 256] - - [772, 24.8242] + - [775, 24.8242] - - [2944, 3584, 1, 1280] - - [854, 8322.01] + - [857, 8322.01] - - [1856, 5888, 1, 1280] - - [839, 8911.81] + - [842, 8911.81] - - [256, 256, 1, 1280] - - [803, 3653.57] + - [806, 3653.57] - - [4608, 24000, 1, 1536] - - [853, 8930.96] + - [856, 8930.96] - - [4288, 1408, 1, 256] - - [840, 8338.35] + - [843, 8338.35] - - [3584, 64, 1, 256] - - [852, 3413.97] + - [855, 3413.97] - - [64, 1856, 1, 3328] - - [779, 5460.13] + - [782, 5460.13] - - [256, 1408, 1, 128] - - [823, 1423.99] + - [826, 1423.99] - - [5888, 1408, 1, 128] - - [834, 4177.78] + - [837, 4177.78] - - [4288, 2368, 1, 1280] - - [843, 8595.95] + - [846, 8595.95] - - [4, 4288, 1, 256] - - [879, 370.854] + - [882, 370.854] - - [256, 4288, 1, 128] - - [824, 2907.89] + - [827, 2907.89] - - [256, 128, 1, 3328] - - [817, 3644.78] + - [820, 3644.78] - - [512, 8, 1, 500000] - - [729, 2025.79] + - [732, 2025.79] - - [6784, 2368, 1, 256] - - [842, 8470.31] + - [845, 8470.31] - - [5888, 128, 1, 128] - - [747, 2604.45] + - [750, 2604.45] - - [1408, 448, 1, 3328] - - [852, 6540.52] + - [855, 6540.52] - - [1024, 24000, 1, 2816] - - [869, 8363.93] + - [872, 8363.93] - - [704, 1024, 1, 1280] - - [852, 7277.18] + - [855, 7277.18] - - [1856, 256, 1, 3328] - - [842, 7039.04] + - [845, 7039.04] - - [1856, 2944, 1, 256] - - [851, 8151.49] + - [854, 8151.49] - - [5056, 1024, 1, 128] - - [825, 4422.72] + - [828, 4422.72] - - [64, 5888, 1, 1280] - - [803, 4854.52] + - [806, 4854.52] - - [7680, 3000, 1, 2560] - - [856, 8789.47] + - [859, 8789.47] - - [4224, 1500, 1, 176] - - [852, 7902.04] + - [855, 7902.04] - - [5124, 700, 1, 2560] - - [842, 8232.49] + - [845, 8232.49] - - [6784, 256, 1, 128] - - [823, 3548.82] + - [826, 3548.82] - - [5888, 704, 1, 128] - - [830, 3959.55] + - [833, 3959.55] - - [6784, 64, 1, 128] - - [758, 2150.72] + - [761, 2150.72] - - [4, 448, 1, 1280] - - [883, 267.963] + - [886, 267.963] - - [1024, 4288, 1, 1280] - - [857, 8363.62] + - [860, 8363.62] - - [2368, 5056, 1, 3328] - - [856, 8581.75] + - [859, 8581.75] - - [448, 4, 1, 128] - - [877, 16.7673] + - [880, 16.7673] - - [4, 256, 1, 3328] - - [886, 201.888] + - [889, 201.888] - - [4288, 1024, 1, 3328] - - [852, 8567.62] + - [855, 8567.62] - - [6144, 48000, 1, 2560] - - [860, 3751.58] + - [863, 3751.58] - - [1024, 5056, 1, 3328] - - [839, 9440.56] + - [842, 9440.56] - - [1024, 1856, 1, 3328] - - [860, 8244.26] + - [863, 8244.26] - - [704, 704, 1, 1280] - - [852, 5529.89] + - [855, 5529.89] - - [128, 2368, 1, 1280] - - [809, 5062.28] + - [812, 5062.28] - - [3584, 4, 1, 128] - - [878, 61.4949] + - [881, 61.4949] - - [3584, 256, 1, 1280] - - [876, 6260.14] + - [879, 6260.14] - - [4, 128, 1, 128] - - [877, 1.1587] + - [880, 1.1587] - - [128, 4288, 1, 3328] - - [788, 6186.05] + - [791, 6186.05] - - [5124, 1500, 1, 2560] - - [856, 8432.52] + - [859, 8432.52] - - [3584, 128, 1, 1280] - - [842, 6547.75] + - [845, 6547.75] - - [4, 256, 1, 1280] - - [795, 180.044] + - [798, 180.044] - - [128, 704, 1, 3328] - - [767, 5177.71] + - [770, 5177.71] - - [4288, 6784, 1, 256] - - [840, 9005.24] + - [843, 9005.24] - - [3584, 2944, 1, 3328] - - [857, 8872.17] + - [860, 8872.17] - - [128, 1856, 1, 256] - - [842, 3690.38] + - [845, 3690.38] - - [64, 4288, 1, 256] - - [842, 3007.47] + - [845, 3007.47] - - [4, 3584, 1, 3328] - - [772, 639.89] + - [775, 639.89] - - [64, 4, 1, 3328] - - [886, 98.6074] + - [889, 98.6074] - - [4, 64, 1, 3328] - - [886, 91.8069] + - [889, 91.8069] - - [35, 700, 1, 2560] - - [740, 2397.55] + - [743, 2397.55] - - [5888, 2944, 1, 256] - - [850, 9031.18] + - [853, 9031.18] - - [4, 2368, 1, 256] - - [790, 256.868] + - [793, 256.868] - - [1856, 64, 1, 256] - - [774, 2222.86] + - [777, 2222.86] - - [5056, 128, 1, 1280] - - [842, 6557.75] + - [845, 6557.75] - - [448, 4288, 1, 1280] - - [866, 6891.56] + - [869, 6891.56] - - [256, 4288, 1, 256] - - [842, 6250.41] + - [845, 6250.41] - - [1024, 4288, 1, 128] - - [826, 3951.31] + - [829, 3951.31] - - [4, 1024, 1, 256] - - [790, 182.044] + - [793, 182.044] - - [5056, 4288, 1, 256] - - [846, 8933.33] + - [849, 8933.33] - - [1024, 448, 1, 256] - - [852, 4573.23] + - [855, 4573.23] - - [1024, 3584, 1, 256] - - [847, 7447.08] + - [850, 7447.08] - - [2944, 128, 1, 1280] - - [852, 5417.17] + - [855, 5417.17] - - [2560, 32, 1, 2560] - - [789, 4076.89] + - [792, 4076.89] - - [64, 256, 1, 256] - - [806, 689.853] + - [809, 689.853] - - [1024, 4, 1, 512] - - [798, 288.07] + - [801, 288.07] - - [128, 2368, 1, 128] - - [752, 1809.58] + - [755, 1809.58] - - [256, 704, 1, 1280] - - [842, 4032.98] + - [845, 4032.98] - - [64, 2368, 1, 128] - - [743, 1165.78] + - [746, 1165.78] - - [176, 1500, 1, 1408] - - [770, 4922.03] + - [773, 4922.03] - - [448, 5888, 1, 1280] - - [852, 7550.11] + - [855, 7550.11] - - [512, 3000, 1, 2048] - - [874, 6562.34] + - [877, 6562.34] - - [5056, 448, 1, 128] - - [824, 3947.87] + - [827, 3947.87] - - [4288, 704, 1, 1280] - - [842, 8243.72] + - [845, 8243.72] - - [3584, 2944, 1, 128] - - [834, 4284.78] + - [837, 4284.78] - - [6784, 256, 1, 1280] - - [842, 7955.11] + - [845, 7955.11] - - [256, 2944, 1, 1280] - - [872, 6691.8] + - [875, 6691.8] - - [2560, 128, 1, 2560] - - [810, 5347.13] + - [813, 5347.13] - - [2368, 5888, 1, 3328] - - [847, 8918.97] + - [850, 8918.97] - - [4, 64, 1, 256] - - [795, 13.0032] + - [798, 13.0032] - - [704, 1024, 1, 3328] - - [872, 6648.02] + - [875, 6648.02] - - [2368, 1856, 1, 1280] - - [858, 8016.41] + - [861, 8016.41] - - [448, 5056, 1, 3328] - - [842, 8231.63] + - [845, 8231.63] - - [128, 448, 1, 128] - - [751, 441.108] + - [754, 441.108] - - [128, 6784, 1, 256] - - [852, 5849.95] + - [855, 5849.95] - - [512, 4, 1, 500000] - - [732, 1027.04] + - [735, 1027.04] - - [3584, 4288, 1, 128] - - [828, 4260.8] + - [831, 4260.8] - - [64, 448, 1, 128] - - [751, 253.454] + - [754, 253.454] - - [1024, 6000, 1, 2816] - - [856, 8886.04] + - [859, 8886.04] - - [5888, 4288, 1, 3328] - - [856, 8968.06] + - [859, 8968.06] - - [2368, 704, 1, 256] - - [872, 4663.14] + - [875, 4663.14] - - [256, 1856, 1, 3328] - - [844, 6480.53] + - [847, 6480.53] - - [1856, 128, 1, 256] - - [842, 3726.56] + - [845, 3726.56] - - [6784, 128, 1, 128] - - [745, 2823.91] + - [748, 2823.91] - - [3584, 1408, 1, 128] - - [828, 3666.68] + - [831, 3666.68] - - [1856, 5056, 1, 1280] - - [839, 8651.26] + - [842, 8651.26] - - [2944, 1024, 1, 1280] - - [850, 8765.11] + - [853, 8765.11] - - [5056, 4, 1, 256] - - [764, 428.588] + - [767, 428.588] - - [3584, 5888, 1, 3328] - - [850, 9347.65] + - [853, 9347.65] - - [2368, 4288, 1, 256] - - [860, 8013.0] + - [863, 8013.0] - - [1024, 2368, 1, 3328] - - [847, 8119.19] + - [850, 8119.19] - - [128, 3584, 1, 128] - - [747, 2584.52] + - [750, 2584.52] - - [704, 1408, 1, 256] - - [852, 6792.17] + - [855, 6792.17] - - [4096, 128, 1, 4096] - - [874, 6624.74] + - [877, 6624.74] - - [1024, 2944, 1, 128] - - [826, 3771.27] + - [829, 3771.27] - - [1024, 3584, 1, 1280] - - [847, 8952.61] + - [850, 8952.61] - - [4288, 5888, 1, 3328] - - [860, 9047.95] + - [863, 9047.95] - - [4288, 4, 1, 3328] - - [765, 615.106] + - [768, 615.106] - - [4608, 16, 1, 1536] - - [769, 2894.84] + - [772, 2894.84] - - [5888, 64, 1, 128] - - [756, 1827.06] + - [759, 1827.06] - - [4, 5888, 1, 128] - - [877, 179.444] + - [880, 179.444] - - [1024, 2944, 1, 3328] - - [848, 8298.67] + - [851, 8298.67] - - [2048, 64, 1, 2048] - - [777, 4963.67] + - [780, 4963.67] - - [6144, 2, 1, 2560] - - [766, 477.78] + - [769, 477.78] - - [256, 6784, 1, 1280] - - [840, 7491.84] + - [843, 7491.84] - - [1856, 3584, 1, 256] - - [852, 7580.5] + - [855, 7580.5] - - [128, 448, 1, 3328] - - [803, 4417.61] + - [806, 4417.61] - - [6784, 1856, 1, 128] - - [831, 4621.64] + - [834, 4621.64] - - [1024, 1500, 1, 2048] - - [852, 6284.4] + - [855, 6284.4] - - [5056, 128, 1, 256] - - [852, 5705.06] + - [855, 5705.06] - - [512, 24000, 1, 2816] - - [839, 8919.75] + - [842, 8919.75] - - [256, 5888, 1, 1280] - - [854, 7977.9] + - [857, 7977.9] - - [4, 128, 1, 1280] - - [795, 94.1609] + - [798, 94.1609] - - [4288, 6784, 1, 3328] - - [860, 9012.48] + - [863, 9012.48] - - [6784, 128, 1, 1280] - - [844, 6807.25] + - [847, 6807.25] - - [64, 1408, 1, 256] - - [773, 2045.09] + - [776, 2045.09] - - [2368, 1408, 1, 128] - - [824, 4340.63] + - [827, 4340.63] - - [1856, 448, 1, 256] - - [873, 3639.89] + - [876, 3639.89] - - [1408, 1024, 1, 128] - - [832, 3417.58] + - [835, 3417.58] - - [128, 64, 1, 128] - - [753, 68.6241] + - [756, 68.6241] - - [6784, 3584, 1, 3328] - - [850, 9425.53] + - [853, 9425.53] - - [1760, 7000, 1, 1760] - - [847, 8780.31] + - [850, 8780.31] - - [1024, 704, 1, 3328] - - [864, 5644.5] + - [867, 5644.5] - - [64, 64, 1, 128] - - [743, 38.1023] + - [746, 38.1023] - - [2368, 5056, 1, 1280] - - [861, 8462.31] + - [864, 8462.31] - - [64, 4, 1, 1280] - - [795, 46.5455] + - [798, 46.5455] - - [1408, 2368, 1, 1280] - - [847, 8234.98] + - [850, 8234.98] - - [128, 1408, 1, 1280] - - [809, 4491.56] + - [812, 4491.56] - - [1024, 1, 1, 512] - - [813, 81.92] + - [816, 81.92] - - [4, 1408, 1, 128] - - [877, 56.32] + - [880, 56.32] - - [704, 4288, 1, 128] - - [831, 3942.86] + - [834, 3942.86] - - [128, 1856, 1, 3328] - - [797, 6111.83] + - [800, 6111.83] - - [2944, 2944, 1, 256] - - [856, 8640.12] + - [859, 8640.12] - - [2944, 4, 1, 1280] - - [790, 554.165] + - [793, 554.165] - - [5888, 4, 1, 256] - - [772, 435.644] + - [775, 435.644] - - [6784, 256, 1, 256] - - [852, 7025.86] + - [855, 7025.86] - - [256, 5056, 1, 3328] - - [852, 8249.47] + - [855, 8249.47] - - [128, 4288, 1, 1280] - - [842, 5561.64] + - [845, 5561.64] - - [5056, 1856, 1, 128] - - [836, 3975.18] + - [839, 3975.18] - - [1024, 3000, 1, 1536] - - [857, 8544.44] + - [860, 8544.44] - - [5056, 1024, 1, 3328] - - [850, 9361.37] + - [853, 9361.37] - - [128, 128, 1, 256] - - [802, 699.051] + - [805, 699.051] - - [1760, 64, 1, 1760] - - [770, 4956.16] + - [773, 4956.16] - - [4288, 3584, 1, 3328] - - [870, 7506.08] + - [873, 7506.08] - - [448, 704, 1, 3328] - - [842, 4697.56] + - [845, 4697.56] - - [448, 448, 1, 128] - - [759, 1249.52] + - [762, 1249.52] - - [1024, 2368, 1, 1280] - - [852, 7756.34] + - [855, 7756.34] - - [1856, 704, 1, 3328] - - [852, 8340.56] + - [855, 8340.56] - - [512, 1500, 1, 2560] - - [854, 6041.29] + - [857, 6041.29] - - [5888, 6784, 1, 3328] - - [850, 9199.28] + - [853, 9199.28] - - [704, 4288, 1, 1280] - - [844, 8341.96] + - [847, 8341.96] - - [128, 50176, 1, 512] - - [890, 7589.38] + - [893, 7589.38] - - [704, 256, 1, 256] - - [842, 2912.71] + - [845, 2912.71] - - [1024, 48000, 1, 2048] - - [847, 8947.32] + - [850, 8947.32] - - [4288, 1024, 1, 128] - - [823, 4291.65] + - [826, 4291.65] - - [3136, 64, 128, 64] - - [905, 8175.06] + - [908, 8175.06] - - [784, 128, 128, 512] - - [904, 8190.53] + - [907, 8190.53] - - [784, 512, 256, 128] - - [902, 8637.14] + - [905, 8637.14] - - [3136, 256, 256, 64] - - [902, 8663.08] + - [905, 8663.08] - - [3136, 64, 128, 256] - - [900, 8943.46] + - [903, 8943.46] - - [3136, 64, 256, 64] - - [905, 8267.12] + - [908, 8267.12] - - [784, 512, 128, 128] - - [902, 8564.25] + - [905, 8564.25] - - [784, 128, 256, 512] - - [906, 8377.06] + - [909, 8377.06] - - [3136, 64, 256, 256] - - [907, 9033.88] + - [910, 9033.88] - - [3136, 256, 128, 64] - - [902, 8624.46] + - [905, 8624.46] - - [1024, 256, 1, 1024] - - [928, 6331.03] + - [931, 6331.03] - - [1024, 512, 1, 2048] - - [927, 8100.04] + - [930, 8100.04] - - [512, 200, 1, 512] - - [936, 2861.83] + - [939, 2861.83] - - [4096, 256, 1, 2048] - - [919, 8812.72] + - [922, 8812.72] - - [4096, 512, 1, 1024] - - [929, 9068.77] + - [932, 9068.77] - - [1024, 200, 1, 1024] - - [928, 5110.02] + - [931, 5110.02] - - [1024, 512, 1, 1024] - - [921, 7785.25] + - [924, 7785.25] - - [2048, 256, 1, 4096] - - [931, 8438.71] + - [934, 8438.71] - - [2048, 768, 1, 512] - - [913, 8618.43] + - [916, 8618.43] - - [512, 256, 1, 1024] - - [933, 4834.93] + - [936, 4834.93] - - [512, 768, 1, 2048] - - [930, 6908.94] + - [933, 6908.94] - - [2048, 256, 1, 1024] - - [926, 7941.88] + - [929, 7941.88] - - [1024, 256, 1, 2048] - - [923, 6997.8] + - [926, 6997.8] - - [2048, 200, 1, 512] - - [926, 5649.66] + - [929, 5649.66] - - [4096, 200, 1, 1024] - - [924, 6678.83] + - [927, 6678.83] - - [2048, 200, 1, 4096] - - [932, 6706.59] + - [935, 6706.59] - - [2048, 512, 1, 1024] - - [929, 8548.9] + - [932, 8548.9] - - [1024, 1024, 1, 512] - - [924, 8046.63] + - [927, 8046.63] - - [1024, 200, 1, 4096] - - [923, 5884.26] + - [926, 5884.26] - - [2048, 512, 1, 4096] - - [934, 8995.84] + - [937, 8995.84] - - [4096, 512, 1, 2048] - - [929, 9298.08] + - [932, 9298.08] - - [4096, 1024, 1, 2048] - - [911, 9790.67] + - [914, 9790.67] - - [2048, 1024, 1, 2048] - - [912, 9278.8] + - [915, 9278.8] - - [1024, 200, 1, 512] - - [928, 4535.36] + - [931, 4535.36] - - [1024, 1024, 1, 4096] - - [919, 8967.29] + - [922, 8967.29] - - [2048, 1024, 1, 4096] - - [914, 9500.46] + - [917, 9500.46] - - [4096, 200, 1, 2048] - - [920, 7082.58] + - [923, 7082.58] - - [2048, 200, 1, 1024] - - [926, 6211.94] + - [929, 6211.94] - - [1024, 768, 1, 512] - - [927, 7401.71] + - [930, 7401.71] - - [2048, 512, 1, 512] - - [924, 8124.56] + - [927, 8124.56] - - [2048, 200, 1, 2048] - - [926, 6561.8] + - [929, 6561.8] - - [2048, 256, 1, 2048] - - [927, 8224.13] + - [930, 8224.13] - - [512, 768, 1, 512] - - [925, 6469.36] + - [928, 6469.36] - - [512, 200, 1, 1024] - - [928, 3755.64] + - [931, 3755.64] - - [4096, 1024, 1, 1024] - - [911, 9605.85] + - [914, 9605.85] - - [4096, 256, 1, 4096] - - [934, 8961.29] + - [937, 8961.29] - - [1024, 512, 1, 512] - - [927, 7108.99] + - [930, 7108.99] - - [512, 256, 1, 512] - - [935, 4032.98] + - [938, 4032.98] - - [1024, 256, 1, 4096] - - [923, 7326.3] + - [926, 7326.3] - - [1024, 200, 1, 2048] - - [916, 5530.46] + - [919, 5530.46] - - [2048, 1024, 1, 512] - - [917, 8995.83] + - [920, 8995.83] - - [1024, 1024, 1, 2048] - - [924, 8830.11] + - [927, 8830.11] - - [4096, 256, 1, 1024] - - [924, 8581.7] + - [927, 8581.7] - - [512, 768, 1, 1024] - - [925, 6875.91] + - [928, 6875.91] - - [1024, 512, 1, 4096] - - [921, 8484.05] + - [924, 8484.05] - - [1024, 256, 1, 512] - - [918, 5667.98] + - [921, 5667.98] - - [4096, 200, 1, 4096] - - [931, 7018.59] + - [934, 7018.59] - - [2048, 256, 1, 512] - - [931, 7078.99] + - [934, 7078.99] - - [512, 200, 1, 2048] - - [936, 4283.4] + - [939, 4283.4] - - [1024, 1024, 1, 1024] - - [919, 8565.27] + - [922, 8565.27] - - [2048, 512, 1, 2048] - - [919, 8850.49] + - [922, 8850.49] - - [4096, 1024, 1, 4096] - - [912, 9843.18] + - [915, 9843.18] - - [2048, 1024, 1, 1024] - - [917, 9234.11] + - [920, 9234.11] - - [4096, 384, 1, 2048] - - [959, 8892.52] + - [962, 8892.52] - - [4096, 192, 1, 2048] - - [953, 8024.18] + - [956, 8024.18] - - [1225, 192, 64, 384] - - [942, 9373.83] + - [945, 9373.83] - - [5329, 64, 64, 160] - - [946, 9186.69] + - [949, 9186.69] - - [1225, 64, 64, 384] - - [941, 8735.76] + - [944, 8735.76] - - [289, 128, 64, 1024] - - [956, 7000.2] + - [959, 7000.2] - - [4096, 320, 1, 1280] - - [961, 8302.26] + - [964, 8302.26] - - [4096, 384, 1, 1536] - - [943, 9052.45] + - [946, 9052.45] - - [4096, 192, 1, 1280] - - [958, 7561.85] + - [961, 7561.85] - - [289, 192, 64, 1024] - - [952, 7346.99] + - [955, 7346.99] - - [1225, 96, 64, 384] - - [939, 8303.08] + - [942, 8303.08] - - [4096, 320, 1, 2048] - - [948, 8384.42] + - [951, 8384.42] - - [4096, 256, 1, 1536] - - [960, 8734.34] + - [963, 8734.34] - - [4096, 384, 1, 1280] - - [957, 9023.24] + - [960, 9023.24] - - [4096, 448, 1, 1280] - - [948, 8343.32] + - [951, 8343.32] - - [289, 256, 64, 1024] - - [951, 7535.46] + - [954, 7535.46] - - [4096, 448, 1, 2048] - - [948, 8572.31] + - [951, 8572.31] - - [289, 384, 64, 1024] - - [949, 7767.57] + - [952, 7767.57] - - [1024, 3594, 1, 4096] - - [968, 8661.42] + - [971, 8661.42] - - [4096, 3103, 1, 1024] - - [978, 9652.13] + - [981, 9652.13] - - [4096, 3136, 1, 1024] - - [962, 9723.05] + - [965, 9723.05] - - [1024, 3141, 1, 4096] - - [980, 8612.02] + - [983, 8612.02] - - [64, 147, 432, 148] - - [995, 6371.93] + - [998, 6371.93] - - [4096, 3559, 1, 1024] - - [967, 9906.25] + - [970, 9906.25] - - [4096, 3368, 1, 1024] - - [962, 9720.91] + - [965, 9720.91] - - [1024, 3335, 1, 4096] - - [986, 8990.19] + - [989, 8990.19] - - [1024, 3510, 1, 4096] - - [986, 9440.58] + - [989, 9440.58] - - [4096, 3209, 1, 1024] - - [967, 9632.66] + - [970, 9632.66] - - [4096, 3322, 1, 1024] - - [966, 9939.42] + - [969, 9939.42] - - [1024, 3400, 1, 4096] - - [985, 9155.99] + - [988, 9155.99] - - [1024, 3995, 1, 4096] - - [968, 9610.15] + - [971, 9610.15] - - [1024, 3503, 1, 4096] - - [986, 9446.47] + - [989, 9446.47] - - [4096, 3594, 1, 1024] - - [977, 9691.86] + - [980, 9691.86] - - [4096, 3473, 1, 1024] - - [966, 9698.8] + - [969, 9698.8] - - [4096, 3522, 1, 1024] - - [967, 9816.82] + - [970, 9816.82] - - [1024, 3103, 1, 4096] - - [964, 8490.95] + - [967, 8490.95] - - [1024, 3214, 1, 4096] - - [985, 8667.57] + - [988, 8667.57] - - [4096, 3449, 1, 1024] - - [977, 9795.61] + - [980, 9795.61] - - [1024, 3136, 1, 4096] - - [986, 8500.51] + - [989, 8500.51] - - [1024, 3955, 1, 33708] - - [966, 9634.84] + - [969, 9634.84] - - [1024, 3780, 1, 4096] - - [969, 9088.78] + - [972, 9088.78] - - [1024, 3906, 1, 33708] - - [967, 9515.36] + - [970, 9515.36] - - [1024, 3386, 1, 4096] - - [986, 9115.95] + - [989, 9115.95] - - [4096, 3396, 1, 1024] - - [977, 9665.5] + - [980, 9665.5] - - [1024, 3183, 1, 4096] - - [964, 8662.84] + - [967, 8662.84] - - [1024, 3098, 1, 4096] - - [980, 8490.12] + - [983, 8490.12] - - [1024, 3548, 1, 4096] - - [986, 9555.53] + - [989, 9555.53] - - [1024, 3224, 1, 4096] - - [979, 8760.78] + - [982, 8760.78] - - [4096, 3469, 1, 1024] - - [966, 9687.11] + - [969, 9687.11] - - [1024, 3582, 1, 4096] - - [983, 9690.9] + - [986, 9690.9] - - [1024, 2977, 1, 4096] - - [968, 9379.28] + - [971, 9379.28] - - [1024, 3939, 1, 1024] - - [965, 9172.01] + - [968, 9172.01] - - [64, 123, 528, 123] - - [1013, 6346.07] + - [1016, 6346.07] - - [64, 12, 5040, 12] - - [990, 1536.0] + - [993, 1536.0] - - [4096, 3176, 1, 1024] - - [978, 9712.1] + - [981, 9712.1] - - [1024, 3559, 1, 4096] - - [982, 9579.74] + - [985, 9579.74] - - [1024, 3478, 1, 4096] - - [986, 9373.75] + - [989, 9373.75] - - [4096, 3343, 1, 1024] - - [962, 9638.67] + - [965, 9638.67] - - [4096, 3440, 1, 1024] - - [962, 9853.86] + - [965, 9853.86] - - [1024, 3996, 1, 33708] - - [966, 9733.45] + - [969, 9733.45] - - [1024, 4012, 1, 4096] - - [967, 9636.89] + - [970, 9636.89] - - [1024, 3322, 1, 4096] - - [986, 8945.02] + - [989, 8945.02] - - [1024, 3990, 1, 33708] - - [966, 9720.21] + - [969, 9720.21] - - [1024, 3314, 1, 4096] - - [986, 8944.62] + - [989, 8944.62] - - [4096, 3513, 1, 1024] - - [966, 9794.85] + - [969, 9794.85] - - [1024, 3562, 1, 4096] - - [986, 9597.18] + - [989, 9597.18] - - [1024, 3443, 1, 4096] - - [986, 9279.42] + - [989, 9279.42] - - [1024, 3554, 1, 4096] - - [983, 9552.06] + - [986, 9552.06] - - [1024, 3063, 1, 4096] - - [968, 9622.48] + - [971, 9622.48] - - [64, 111, 576, 112] - - [1013, 6274.55] + - [1016, 6274.55] - - [4096, 3460, 1, 1024] - - [966, 9665.59] + - [969, 9665.59] - - [1024, 3209, 1, 4096] - - [965, 8708.29] + - [968, 8708.29] - - [1024, 3147, 1, 4096] - - [986, 8492.13] + - [989, 8492.13] - - [4096, 3387, 1, 1024] - - [963, 9761.24] + - [966, 9761.24] - - [4096, 3436, 1, 1024] - - [962, 9815.05] + - [965, 9815.05] - - [1024, 3341, 1, 4096] - - [985, 9004.97] + - [988, 9004.97] - - [1024, 3516, 1, 4096] - - [985, 9471.29] + - [988, 9471.29] - - [4096, 3277, 1, 1024] - - [966, 9807.02] + - [969, 9807.02] - - [1024, 3454, 1, 4096] - - [986, 9300.93] + - [989, 9300.93] - - [1024, 3969, 1, 4096] - - [966, 9539.72] + - [969, 9539.72] - - [1024, 3999, 1, 4096] - - [967, 9607.42] + - [970, 9607.42] - - [1024, 4032, 1, 4096] - - [968, 9693.37] + - [971, 9693.37] - - [4096, 3541, 1, 1024] - - [967, 9866.63] + - [970, 9866.63] - - [4096, 3334, 1, 1024] - - [978, 9614.31] + - [981, 9614.31] - - [1024, 3365, 1, 4096] - - [986, 9058.48] + - [989, 9058.48] - - [1024, 3527, 1, 4096] - - [986, 9510.21] + - [989, 9510.21] - - [1024, 3190, 1, 4096] - - [985, 8627.7] + - [988, 8627.7] - - [4096, 3906, 1, 1024] - - [963, 9817.68] + - [966, 9817.68] - - [1024, 3593, 1, 4096] - - [968, 8662.99] + - [971, 8662.99] - - [1024, 3336, 1, 4096] - - [986, 8991.03] + - [989, 8991.03] - - [4096, 3504, 1, 1024] - - [966, 9769.76] + - [969, 9769.76] - - [4096, 3977, 1, 1024] - - [967, 9742.52] + - [970, 9742.52] - - [1024, 3906, 1, 4096] - - [967, 9386.15] + - [970, 9386.15] - - [4096, 3415, 1, 1024] - - [977, 9802.6] + - [980, 9802.6] - - [1024, 3295, 1, 4096] - - [985, 8879.16] + - [988, 8879.16] - - [4096, 3321, 1, 1024] - - [967, 9931.33] + - [970, 9931.33] - - [1024, 3072, 1, 4096] - - [968, 9671.61] + - [971, 9671.61] - - [1024, 3408, 1, 4096] - - [985, 9182.73] + - [988, 9182.73] - - [1024, 3522, 1, 4096] - - [986, 9484.53] + - [989, 9484.53] - - [4096, 3751, 1, 1024] - - [967, 9778.76] + - [970, 9778.76] - - [4096, 3378, 1, 1024] - - [977, 9692.67] + - [980, 9692.67] - - [64, 77, 816, 77] - - [1019, 4850.19] + - [1022, 4850.19] - - [1024, 3925, 1, 33708] - - [966, 9560.78] + - [969, 9560.78] - - [1024, 3990, 1, 1024] - - [968, 9272.65] + - [971, 9272.65] - - [1024, 3290, 1, 4096] - - [979, 8905.51] + - [982, 8905.51] - - [4096, 3500, 1, 1024] - - [967, 9761.72] + - [970, 9761.72] - - [4096, 3565, 1, 1024] - - [966, 9919.27] + - [969, 9919.27] - - [1024, 3484, 1, 4096] - - [985, 9376.42] + - [988, 9376.42] - - [4096, 3395, 1, 1024] - - [978, 9788.06] + - [981, 9788.06] - - [64, 92, 688, 92] - - [1005, 5606.0] + - [1008, 5606.0] - - [1024, 3681, 1, 1024] - - [970, 8690.13] + - [973, 8690.13] - - [64, 159, 400, 159] - - [997, 6518.87] + - [1000, 6518.87] - - [1024, 3584, 1, 1024] - - [985, 9365.27] + - [988, 9365.27] - - [4096, 3093, 1, 1024] - - [977, 9623.31] + - [980, 9623.31] - - [1024, 4050, 1, 1024] - - [969, 9354.04] + - [972, 9354.04] - - [1024, 3301, 1, 4096] - - [986, 8888.94] + - [989, 8888.94] - - [1024, 3581, 1, 4096] - - [985, 9673.72] + - [988, 9673.72] - - [4096, 3374, 1, 1024] - - [978, 9707.23] + - [981, 9707.23] - - [1024, 3449, 1, 4096] - - [986, 9270.8] + - [989, 9270.8] - - [4096, 3215, 1, 1024] - - [967, 9645.15] + - [970, 9645.15] - - [4096, 3312, 1, 1024] - - [967, 9888.62] + - [970, 9888.62] - - [4096, 3479, 1, 1024] - - [967, 9698.51] + - [970, 9698.51] - - [4096, 3544, 1, 1024] - - [967, 9874.99] + - [970, 9874.99] - - [1024, 3263, 1, 4096] - - [986, 8787.51] + - [989, 8787.51] - - [4096, 3455, 1, 1024] - - [977, 9845.19] + - [980, 9845.19] - - [1024, 3379, 1, 4096] - - [983, 9099.91] + - [986, 9099.91] - - [1024, 3490, 1, 4096] - - [986, 9397.39] + - [989, 9397.39] - - [1024, 3368, 1, 4096] - - [986, 9079.15] + - [989, 9079.15] - - [4096, 3186, 1, 1024] - - [962, 9750.07] + - [965, 9750.07] - - [1024, 3428, 1, 4096] - - [986, 9232.82] + - [989, 9232.82] - - [64, 85, 752, 84] - - [1001, 5342.57] + - [1004, 5342.57] - - [4096, 3561, 1, 1024] - - [967, 9913.92] + - [970, 9913.92] - - [4096, 3418, 1, 1024] - - [977, 9765.76] + - [980, 9765.76] - - [1024, 3064, 1, 4096] - - [968, 9621.58] + - [971, 9621.58] - - [4096, 3259, 1, 1024] - - [967, 9765.42] + - [970, 9765.42] - - [4096, 3308, 1, 1024] - - [966, 9900.36] + - [969, 9900.36] - - [1024, 3533, 1, 4096] - - [986, 9520.02] + - [989, 9520.02] - - [1024, 3344, 1, 4096] - - [986, 9014.45] + - [989, 9014.45] - - [1024, 4030, 1, 1024] - - [968, 9354.0] + - [971, 9354.0] - - [4096, 3459, 1, 1024] - - [967, 9656.1] + - [970, 9656.1] - - [1024, 3572, 1, 4096] - - [983, 9639.97] + - [986, 9639.97] - - [1024, 3925, 1, 1024] - - [979, 9173.64] + - [982, 9173.64] - - [4096, 3435, 1, 1024] - - [962, 9778.1] + - [965, 9778.1] - - [1024, 3956, 1, 4096] - - [969, 9498.46] + - [972, 9498.46] - - [1024, 3463, 1, 4096] - - [986, 9332.36] + - [989, 9332.36] - - [4096, 3182, 1, 1024] - - [977, 9826.74] + - [980, 9826.74] - - [4096, 3976, 1, 1024] - - [977, 9741.89] + - [980, 9741.89] - - [1024, 3417, 1, 4096] - - [986, 9208.87] + - [989, 9208.87] - - [1024, 3528, 1, 4096] - - [986, 9508.99] + - [989, 9508.99] - - [4096, 3446, 1, 1024] - - [977, 9816.87] + - [980, 9816.87] - - [64, 122, 528, 123] - - [1013, 6325.88] + - [1016, 6325.88] - - [1024, 3543, 1, 4096] - - [986, 9538.63] + - [989, 9538.63] - - [4096, 3287, 1, 1024] - - [966, 9845.94] + - [969, 9845.94] - - [1024, 3499, 1, 4096] - - [986, 9428.41] + - [989, 9428.41] - - [1024, 3231, 1, 4096] - - [979, 8769.81] + - [982, 8769.81] - - [64, 17, 3632, 17] - - [1001, 1934.84] + - [1004, 1934.84] - - [4096, 3519, 1, 1024] - - [966, 9804.28] + - [969, 9804.28] - - [4096, 3552, 1, 1024] - - [966, 9892.55] + - [969, 9892.55] - - [1024, 3458, 1, 4096] - - [986, 9312.18] + - [989, 9312.18] - - [64, 93, 688, 92] - - [1005, 5660.12] + - [1008, 5660.12] - - [1024, 3374, 1, 4096] - - [980, 9110.31] + - [983, 9110.31] - - [1024, 3396, 1, 4096] - - [986, 9145.69] + - [989, 9145.69] - - [1024, 2967, 1, 4096] - - [968, 9364.66] + - [971, 9364.66] - - [64, 19, 3264, 19] - - [1005, 2142.37] + - [1008, 2142.37] - - [4096, 3482, 1, 1024] - - [966, 9714.1] + - [969, 9714.1] - - [64, 32, 1984, 32] - - [1016, 3619.81] + - [1019, 3619.81] - - [64, 102, 624, 99] - - [1007, 5515.23] + - [1010, 5515.23] - - [1024, 3226, 1, 4096] - - [965, 8790.37] + - [968, 8790.37] - - [4096, 3377, 1, 1024] - - [963, 9683.98] + - [966, 9683.98] - - [4096, 3426, 1, 1024] - - [978, 9869.84] + - [981, 9869.84] - - [4096, 2935, 1, 1024] - - [978, 9762.01] + - [981, 9762.01] - - [64, 133, 480, 133] - - [1017, 5891.22] + - [1020, 5891.22] - - [1024, 3439, 1, 4096] - - [986, 9253.89] + - [989, 9253.89] - - [4096, 3267, 1, 1024] - - [966, 9783.8] + - [969, 9783.8] - - [4096, 3499, 1, 1024] - - [967, 9761.01] + - [970, 9761.01] - - [4096, 3356, 1, 1024] - - [978, 9679.34] + - [981, 9679.34] - - [64, 232, 272, 232] - - [1021, 7180.93] + - [1024, 7180.93] - - [64, 162, 400, 159] - - [981, 6444.53] + - [984, 6444.53] - - [4096, 3939, 1, 1024] - - [977, 9877.9] + - [980, 9877.9] - - [1024, 3526, 1, 4096] - - [986, 9508.0] + - [989, 9508.0] - - [1024, 3859, 1, 33708] - - [967, 9402.03] + - [970, 9402.03] - - [1024, 3385, 1, 4096] - - [985, 9107.18] + - [988, 9107.18] - - [1024, 3496, 1, 4096] - - [986, 9417.9] + - [989, 9417.9] - - [4096, 3141, 1, 1024] - - [978, 9682.44] + - [981, 9682.44] - - [4096, 3510, 1, 1024] - - [966, 9786.49] + - [969, 9786.49] - - [1024, 3434, 1, 4096] - - [986, 9246.6] + - [989, 9246.6] - - [4096, 3969, 1, 1024] - - [966, 9714.75] + - [969, 9714.75] - - [1024, 3121, 1, 4096] - - [964, 8464.22] + - [967, 8464.22] - - [1024, 3232, 1, 4096] - - [986, 8711.63] + - [989, 8711.63] - - [1024, 4030, 1, 33708] - - [967, 9816.21] + - [970, 9816.21] - - [1024, 3780, 1, 33708] - - [975, 9315.44] + - [978, 9315.44] - - [1024, 3969, 1, 1024] - - [964, 9248.44] + - [967, 9248.44] - - [4096, 3527, 1, 1024] - - [966, 9832.84] + - [969, 9832.84] - - [4096, 3336, 1, 1024] - - [963, 9623.25] + - [966, 9623.25] - - [4096, 3290, 1, 1024] - - [966, 9852.11] + - [969, 9852.11] - - [64, 9, 6544, 9] - - [1006, 1068.14] + - [1009, 1068.14] - - [1024, 3469, 1, 4096] - - [986, 9350.45] + - [989, 9350.45] - - [4096, 3490, 1, 1024] - - [966, 9737.46] + - [969, 9737.46] - - [4096, 3064, 1, 1024] - - [966, 9889.92] + - [969, 9889.92] - - [4096, 3582, 1, 1024] - - [967, 9961.28] + - [970, 9961.28] - - [1024, 3956, 1, 1024] - - [964, 9294.15] + - [967, 9294.15] - - [4096, 3417, 1, 1024] - - [962, 9811.56] + - [965, 9811.56] - - [1024, 2736, 1, 4096] - - [968, 8636.6] + - [971, 8636.6] - - [64, 78, 816, 78] - - [1005, 4946.0] + - [1008, 4946.0] - - [1024, 3205, 1, 4096] - - [980, 8657.11] + - [983, 8657.11] - - [1024, 3143, 1, 4096] - - [980, 8567.77] + - [983, 8567.77] - - [1024, 4020, 1, 4096] - - [968, 9664.52] + - [971, 9664.52] - - [1024, 3318, 1, 4096] - - [965, 8966.95] + - [968, 8966.95] - - [4096, 3364, 1, 1024] - - [978, 9697.08] + - [981, 9697.08] - - [1024, 3353, 1, 4096] - - [986, 9034.07] + - [989, 9034.07] - - [1024, 3464, 1, 4096] - - [986, 9325.95] + - [989, 9325.95] - - [4096, 3205, 1, 1024] - - [966, 9619.0] + - [969, 9619.0] - - [4096, 3318, 1, 1024] - - [967, 9932.56] + - [970, 9932.56] - - [1024, 3402, 1, 4096] - - [985, 9153.39] + - [988, 9153.39] - - [4096, 3181, 1, 1024] - - [977, 9789.05] + - [980, 9789.05] - - [4096, 3550, 1, 1024] - - [967, 9888.03] + - [970, 9888.03] - - [4096, 3445, 1, 1024] - - [977, 9752.55] + - [980, 9752.55] - - [1024, 3138, 1, 4096] - - [963, 8484.0] + - [966, 8484.0] - - [64, 99, 624, 99] - - [1013, 5323.89] + - [1016, 5323.89] - - [4096, 3079, 1, 1024] - - [963, 9562.16] + - [966, 9562.16] - - [4096, 3144, 1, 1024] - - [977, 9686.56] + - [980, 9686.56] - - [4096, 3860, 1, 1024] - - [978, 9733.32] + - [981, 9733.32] - - [1024, 3515, 1, 4096] - - [986, 9478.34] + - [989, 9478.34] - - [4096, 3408, 1, 1024] - - [963, 9764.86] + - [966, 9764.86] - - [64, 101, 624, 102] - - [1013, 5482.69] + - [1016, 5482.69] - - [1024, 3181, 1, 4096] - - [965, 8593.16] + - [968, 8593.16] - - [4096, 3298, 1, 1024] - - [967, 9867.62] + - [970, 9867.62] - - [4096, 3585, 1, 1024] - - [977, 9632.91] + - [980, 9632.91] - - [1024, 3550, 1, 4096] - - [986, 9564.36] + - [989, 9564.36] - - [1024, 4020, 1, 1024] - - [969, 9339.05] + - [972, 9339.05] - - [4096, 3481, 1, 1024] - - [967, 9713.9] + - [970, 9713.9] - - [4096, 3530, 1, 1024] - - [967, 9833.89] + - [970, 9833.89] - - [4096, 3425, 1, 1024] - - [963, 9675.56] + - [966, 9675.56] - - [4096, 4026, 1, 1024] - - [967, 9849.67] + - [970, 9849.67] - - [1024, 3860, 1, 1024] - - [980, 9073.49] + - [983, 9073.49] - - [4096, 3975, 1, 1024] - - [967, 9737.62] + - [970, 9737.62] - - [1024, 3286, 1, 4096] - - [964, 8884.14] + - [967, 8884.14] - - [1024, 3176, 1, 4096] - - [964, 8597.38] + - [967, 8597.38] - - [1024, 3894, 1, 4096] - - [968, 9359.03] + - [971, 9359.03] - - [4096, 3355, 1, 1024] - - [977, 9692.99] + - [980, 9692.99] - - [4096, 3404, 1, 1024] - - [977, 9786.02] + - [980, 9786.02] - - [1024, 3501, 1, 4096] - - [985, 9426.04] + - [988, 9426.04] - - [4096, 3245, 1, 1024] - - [967, 9723.47] + - [970, 9723.47] - - [1024, 3431, 1, 4096] - - [983, 9244.22] + - [986, 9244.22] - - [1024, 4000, 1, 1024] - - [979, 9343.93] + - [982, 9343.93] - - [4096, 3509, 1, 1024] - - [966, 9781.62] + - [969, 9781.62] - - [4096, 3558, 1, 1024] - - [967, 9905.05] + - [970, 9905.05] - - [1024, 3535, 1, 4096] - - [985, 9519.05] + - [988, 9519.05] - - [1024, 3414, 1, 4096] - - [983, 9197.95] + - [986, 9197.95] - - [1024, 3445, 1, 4096] - - [986, 9279.56] + - [989, 9279.56] - - [1024, 3436, 1, 4096] - - [986, 9259.6] + - [989, 9259.6] - - [4096, 3472, 1, 1024] - - [967, 9685.17] + - [970, 9685.17] - - [1024, 3211, 1, 4096] - - [965, 8708.31] + - [968, 8708.31] - - [64, 7, 8192, 7] - - [1002, 802.816] + - [1005, 802.816] - - [4096, 3383, 1, 1024] - - [977, 9734.72] + - [980, 9734.72] - - [4096, 3448, 1, 1024] - - [978, 9828.44] + - [981, 9828.44] - - [1024, 3343, 1, 4096] - - [979, 9010.36] + - [982, 9010.36] - - [1024, 3518, 1, 4096] - - [986, 9467.92] + - [989, 9467.92] - - [4096, 3289, 1, 1024] - - [967, 9844.06] + - [970, 9844.06] - - [1024, 3440, 1, 4096] - - [982, 9269.42] + - [985, 9269.42] - - [1024, 4032, 1, 33708] - - [966, 9822.31] + - [969, 9822.31] - - [4096, 3489, 1, 1024] - - [966, 9741.93] + - [969, 9741.93] - - [4096, 3346, 1, 1024] - - [963, 9616.64] + - [966, 9616.64] - - [1024, 3534, 1, 4096] - - [985, 9524.19] + - [988, 9524.19] - - [1024, 3079, 1, 4096] - - [980, 8397.67] + - [983, 8397.67] - - [1024, 3955, 1, 4096] - - [967, 9492.15] + - [970, 9492.15] - - [4096, 3236, 1, 1024] - - [967, 9705.93] + - [970, 9705.93] - - [1024, 3545, 1, 4096] - - [985, 9551.87] + - [988, 9551.87] - - [1024, 3144, 1, 4096] - - [979, 8556.7] + - [982, 8556.7] - - [4096, 3780, 1, 1024] - - [966, 9847.5] + - [969, 9847.5] - - [4096, 3163, 1, 1024] - - [977, 9717.69] + - [980, 9717.69] - - [4096, 3468, 1, 1024] - - [967, 9686.39] + - [970, 9686.39] - - [1024, 3539, 1, 4096] - - [986, 9526.89] + - [989, 9526.89] - - [1024, 3541, 1, 4096] - - [986, 9532.76] + - [989, 9532.76] - - [4096, 3363, 1, 1024] - - [962, 9699.0] + - [965, 9699.0] - - [1024, 3475, 1, 4096] - - [986, 9357.0] + - [989, 9357.0] - - [4096, 3110, 1, 1024] - - [978, 9659.58] + - [981, 9659.58] - - [1024, 3509, 1, 4096] - - [985, 9450.49] + - [988, 9450.49] - - [1024, 3413, 1, 4096] - - [986, 9185.81] + - [989, 9185.81] - - [1024, 3975, 1, 1024] - - [964, 9315.42] + - [967, 9315.42] - - [4096, 3549, 1, 1024] - - [967, 9884.72] + - [970, 9884.72] - - [4096, 3342, 1, 1024] - - [977, 9644.27] + - [980, 9644.27] - - [1024, 2985, 1, 4096] - - [967, 9392.07] + - [970, 9392.07] - - [1024, 3876, 1, 33708] - - [966, 9442.22] + - [969, 9442.22] - - [4096, 3280, 1, 1024] - - [966, 9819.92] + - [969, 9819.92] - - [4096, 3191, 1, 1024] - - [978, 9862.08] + - [981, 9862.08] - - [4096, 3512, 1, 1024] - - [967, 9793.11] + - [970, 9793.11] - - [1024, 3560, 1, 4096] - - [983, 9555.45] + - [986, 9555.45] - - [4096, 2499, 1, 1024] - - [967, 9669.35] + - [970, 9669.35] - - [1024, 3248, 1, 4096] - - [964, 8811.84] + - [967, 8811.84] - - [4096, 3423, 1, 1024] - - [978, 9729.67] + - [981, 9729.67] - - [64, 111, 576, 111] - - [1013, 5982.63] + - [1016, 5982.63] - - [4096, 3297, 1, 1024] - - [966, 9865.19] + - [969, 9865.19] - - [4096, 3154, 1, 1024] - - [978, 9613.42] + - [981, 9613.42] - - [1024, 3303, 1, 4096] - - [965, 8951.79] + - [968, 8951.79] - - [1024, 3222, 1, 4096] - - [985, 8682.89] + - [988, 8682.89] - - [1024, 3978, 1, 1024] - - [969, 9234.93] + - [972, 9234.93] - - [4096, 3529, 1, 1024] - - [967, 9831.62] + - [970, 9831.62] - - [4096, 3386, 1, 1024] - - [977, 9755.67] + - [980, 9755.67] - - [64, 134, 480, 134] - - [992, 5990.53] + - [995, 5990.53] - - [1024, 3451, 1, 4096] - - [983, 9277.61] + - [986, 9277.61] - - [4096, 3562, 1, 1024] - - [967, 9908.82] + - [970, 9908.82] - - [4096, 3276, 1, 1024] - - [966, 9818.04] + - [969, 9818.04] - - [64, 135, 480, 132] - - [1021, 6071.77] + - [1024, 6071.77] - - [1024, 3894, 1, 33708] - - [966, 9487.79] + - [969, 9487.79] - - [64, 134, 480, 132] - - [1020, 6091.65] + - [1023, 6091.65] - - [4096, 3540, 1, 1024] - - [967, 9862.79] + - [970, 9862.79] - - [1024, 3416, 1, 4096] - - [985, 9206.17] + - [988, 9206.17] - - [1024, 4005, 1, 33708] - - [966, 9757.19] + - [969, 9757.19] - - [1024, 3942, 1, 4096] - - [969, 9455.75] + - [972, 9455.75] - - [4096, 3403, 1, 1024] - - [977, 9739.36] + - [980, 9739.36] - - [4096, 3381, 1, 1024] - - [978, 9760.04] + - [981, 9760.04] - - [1024, 3492, 1, 4096] - - [982, 9391.69] + - [985, 9391.69] - - [4096, 3101, 1, 1024] - - [978, 9625.92] + - [981, 9625.92] - - [1024, 3430, 1, 4096] - - [986, 9232.04] + - [989, 9232.04] - - [1024, 3977, 1, 4096] - - [969, 9562.9] + - [972, 9562.9] - - [1024, 3640, 1, 4096] - - [968, 8761.4] + - [971, 8761.4] - - [4096, 3557, 1, 1024] - - [967, 9905.42] + - [970, 9905.42] - - [4096, 3414, 1, 1024] - - [963, 9755.39] + - [966, 9755.39] - - [1024, 3391, 1, 4096] - - [986, 9142.56] + - [989, 9142.56] - - [64, 134, 480, 135] - - [995, 5922.05] + - [998, 5922.05] - - [64, 16, 3840, 16] - - [1011, 2080.51] + - [1014, 2080.51] - - [1024, 3356, 1, 4096] - - [986, 9050.99] + - [989, 9050.99] - - [4096, 3320, 1, 1024] - - [967, 9929.47] + - [970, 9929.47] - - [4096, 2765, 1, 1024] - - [967, 9750.18] + - [970, 9750.18] - - [64, 162, 400, 162] - - [984, 6515.19] + - [987, 6515.19] - - [1024, 3411, 1, 4096] - - [986, 9185.62] + - [989, 9185.62] - - [1024, 3978, 1, 4096] - - [966, 9562.67] + - [969, 9562.67] - - [4096, 3487, 1, 1024] - - [967, 9733.75] + - [970, 9733.75] - - [4096, 3520, 1, 1024] - - [966, 9813.85] + - [969, 9813.85] - - [4096, 3942, 1, 1024] - - [977, 9804.29] + - [980, 9804.29] - - [4096, 3431, 1, 1024] - - [962, 9818.96] + - [965, 9818.96] - - [1024, 3271, 1, 4096] - - [979, 8912.98] + - [982, 8912.98] - - [4096, 4020, 1, 1024] - - [966, 9831.32] + - [969, 9831.32] - - [1024, 3481, 1, 4096] - - [982, 9376.05] + - [985, 9376.05] - - [1024, 3419, 1, 4096] - - [985, 9208.58] + - [988, 9208.58] - - [1024, 4059, 1, 4096] - - [969, 9733.73] + - [972, 9733.73] - - [4096, 3345, 1, 1024] - - [978, 9651.33] + - [981, 9651.33] - - [4096, 3394, 1, 1024] - - [978, 9780.33] + - [981, 9780.33] - - [1024, 3298, 1, 4096] - - [985, 8889.53] + - [988, 8889.53] - - [4096, 3235, 1, 1024] - - [967, 9705.71] + - [970, 9705.71] - - [1024, 3681, 1, 33708] - - [974, 9146.12] + - [977, 9146.12] - - [1024, 3840, 1, 4096] - - [967, 9253.85] + - [970, 9253.85] - - [1024, 3362, 1, 4096] - - [986, 9059.71] + - [989, 9059.71] - - [4096, 3467, 1, 1024] - - [966, 9677.41] + - [969, 9677.41] - - [1024, 3349, 1, 4096] - - [986, 9033.97] + - [989, 9033.97] - - [1024, 3460, 1, 4096] - - [986, 9322.84] + - [989, 9322.84] - - [4096, 3214, 1, 1024] - - [967, 9644.36] + - [970, 9644.36] - - [1024, 3398, 1, 4096] - - [986, 9157.19] + - [989, 9157.19] - - [4096, 3478, 1, 1024] - - [966, 9706.56] + - [969, 9706.56] - - [1024, 4050, 1, 33708] - - [966, 9865.04] + - [969, 9865.04] - - [1024, 3244, 1, 4096] - - [982, 8744.43] + - [985, 8744.43] - - [4096, 3341, 1, 1024] - - [978, 9646.69] + - [981, 9646.69] - - [4096, 3454, 1, 1024] - - [963, 9880.46] + - [966, 9880.46] - - [1024, 3166, 1, 4096] - - [980, 8618.36] + - [983, 8618.36] - - [1024, 3425, 1, 4096] - - [986, 9225.22] + - [989, 9225.22] - - [4096, 3295, 1, 1024] - - [967, 9863.71] + - [970, 9863.71] - - [4096, 3072, 1, 1024] - - [966, 9970.99] + - [969, 9970.99] - - [4096, 3822, 1, 1024] - - [967, 9951.97] + - [970, 9951.97] - - [1024, 3681, 1, 4096] - - [968, 8856.84] + - [971, 8856.84] - - [1024, 4050, 1, 4096] - - [968, 9717.48] + - [971, 9717.48] - - [4096, 3495, 1, 1024] - - [966, 9741.04] + - [969, 9741.04] - - [4096, 3560, 1, 1024] - - [967, 9909.04] + - [970, 9909.04] - - [1024, 3524, 1, 4096] - - [985, 9503.1] + - [988, 9503.1] - - [1024, 3942, 1, 33708] - - [966, 9602.57] + - [969, 9602.57] - - [1024, 3304, 1, 4096] - - [965, 8928.66] + - [968, 8928.66] - - [1024, 3387, 1, 4096] - - [986, 9127.55] + - [989, 9127.55] - - [1024, 3498, 1, 4096] - - [985, 9423.29] + - [988, 9423.29] - - [4096, 3458, 1, 1024] - - [966, 9642.53] + - [969, 9642.53] - - [4096, 2967, 1, 1024] - - [966, 9626.61] + - [969, 9626.61] - - [64, 8, 7280, 8] - - [988, 1032.51] + - [991, 1032.51] - - [4096, 3385, 1, 1024] - - [962, 9735.67] + - [965, 9735.67] - - [4096, 3434, 1, 1024] - - [977, 9808.8] + - [980, 9808.8] - - [1024, 3519, 1, 4096] - - [986, 9484.73] + - [989, 9484.73] - - [1024, 3511, 1, 4096] - - [986, 9456.37] + - [989, 9456.37] - - [1024, 3288, 1, 4096] - - [985, 8863.95] + - [988, 8863.95] - - [1024, 2918, 1, 4096] - - [968, 9170.25] + - [971, 9170.25] - - [4096, 3573, 1, 1024] - - [967, 9945.75] + - [970, 9945.75] - - [1024, 3822, 1, 33708] - - [976, 9330.9] + - [979, 9330.9] - - [64, 102, 624, 102] - - [1013, 5531.07] + - [1016, 5531.07] - - [4096, 3539, 1, 1024] - - [967, 9855.29] + - [970, 9855.29] - - [4096, 3332, 1, 1024] - - [978, 9648.87] + - [981, 9648.87] - - [4096, 3286, 1, 1024] - - [967, 9846.32] + - [970, 9846.32] - - [1024, 4026, 1, 4096] - - [968, 9675.84] + - [971, 9675.84] - - [1024, 3277, 1, 4096] - - [982, 8836.11] + - [985, 8836.11] - - [1024, 3471, 1, 4096] - - [986, 9346.23] + - [989, 9346.23] - - [4096, 3518, 1, 1024] - - [967, 9804.1] + - [970, 9804.1] - - [1024, 3393, 1, 4096] - - [986, 9148.89] + - [989, 9148.89] - - [4096, 3413, 1, 1024] - - [963, 9785.07] + - [966, 9785.07] - - [4096, 3303, 1, 1024] - - [967, 9884.27] + - [970, 9884.27] - - [1024, 3207, 1, 4096] - - [964, 8714.59] + - [967, 8714.59] - - [1024, 3894, 1, 1024] - - [980, 9181.41] + - [983, 9181.41] - - [1024, 3977, 1, 1024] - - [980, 9240.8] + - [983, 9240.8] - - [64, 135, 480, 133] - - [995, 5923.3] + - [998, 5923.3] - - [4096, 3535, 1, 1024] - - [967, 9839.45] + - [970, 9839.45] - - [4096, 3376, 1, 1024] - - [962, 9711.92] + - [965, 9711.92] - - [1024, 3355, 1, 4096] - - [986, 9043.17] + - [989, 9043.17] - - [64, 27, 2336, 27] - - [1014, 2929.8] + - [1017, 2929.8] - - [1024, 3466, 1, 4096] - - [986, 9339.0] + - [989, 9339.0] - - [4096, 3266, 1, 1024] - - [967, 9789.19] + - [970, 9789.19] - - [1024, 3404, 1, 4096] - - [986, 9176.66] + - [989, 9176.66] - - [1024, 3999, 1, 1024] - - [979, 9391.81] + - [982, 9391.81] - - [64, 148, 432, 143] - - [992, 6182.82] + - [995, 6182.82] - - [4096, 3498, 1, 1024] - - [966, 9764.46] + - [969, 9764.46] - - [1024, 4032, 1, 1024] - - [964, 9401.93] + - [967, 9401.93] - - [1024, 3410, 1, 4096] - - [985, 9183.4] + - [988, 9183.4] - - [4096, 3393, 1, 1024] - - [978, 9695.39] + - [981, 9695.39] - - [1024, 3140, 1, 4096] - - [979, 8504.76] + - [982, 8504.76] - - [1024, 3910, 1, 33708] - - [966, 9525.96] + - [969, 9525.96] - - [1024, 3334, 1, 4096] - - [985, 8987.49] + - [988, 8987.49] - - [4096, 3140, 1, 1024] - - [978, 9660.61] + - [981, 9660.61] - - [1024, 4005, 1, 4096] - - [969, 9629.78] + - [972, 9629.78] - - [1024, 3579, 1, 4096] - - [985, 9661.35] + - [988, 9661.35] - - [4096, 3372, 1, 1024] - - [978, 9697.22] + - [981, 9697.22] - - [1024, 3245, 1, 4096] - - [979, 8847.66] + - [982, 8847.66] - - [64, 38, 1680, 38] - - [989, 3340.34] + - [992, 3340.34] - - [4096, 3956, 1, 1024] - - [978, 9911.05] + - [981, 9911.05] - - [4096, 3213, 1, 1024] - - [966, 9643.01] + - [969, 9643.01] - - [1024, 3361, 1, 4096] - - [986, 9062.14] + - [989, 9062.14] - - [1024, 3536, 1, 4096] - - [985, 9530.55] + - [988, 9530.55] - - [1024, 3968, 1, 1024] - - [980, 9377.82] + - [983, 9377.82] - - [4096, 3477, 1, 1024] - - [967, 9700.67] + - [970, 9700.67] - - [4096, 3526, 1, 1024] - - [967, 9824.31] + - [970, 9824.31] - - [1024, 4005, 1, 1024] - - [964, 9362.29] + - [967, 9362.29] - - [1024, 3530, 1, 4096] - - [983, 9487.07] + - [986, 9487.07] - - [1024, 3944, 1, 4096] - - [968, 9464.45] + - [971, 9464.45] - - [4096, 3453, 1, 1024] - - [977, 9826.67] + - [980, 9826.67] - - [4096, 3184, 1, 1024] - - [978, 9833.49] + - [981, 9833.49] - - [4096, 3579, 1, 1024] - - [967, 9962.45] + - [970, 9962.45] - - [4096, 3351, 1, 1024] - - [978, 9653.24] + - [981, 9653.24] - - [4096, 3416, 1, 1024] - - [962, 9810.3] + - [965, 9810.3] - - [64, 100, 624, 100] - - [1013, 5408.45] + - [1016, 5408.45] - - [1024, 3822, 1, 4096] - - [968, 9196.1] + - [971, 9196.1] - - [1024, 3796, 1, 4096] - - [968, 9131.86] + - [971, 9131.86] - - [4096, 3257, 1, 1024] - - [966, 9767.24] + - [969, 9767.24] - - [4096, 3306, 1, 1024] - - [966, 9893.25] + - [969, 9893.25] - - [1024, 3505, 1, 4096] - - [986, 9449.92] + - [989, 9449.92] - - [1024, 3315, 1, 4096] - - [979, 8979.67] + - [982, 8979.67] - - [1024, 3486, 1, 4096] - - [985, 9393.38] + - [988, 9393.38] - - [4096, 3457, 1, 1024] - - [966, 9653.09] + - [969, 9653.09] - - [4096, 3870, 1, 1024] - - [963, 9717.41] + - [966, 9717.41] - - [1024, 3447, 1, 4096] - - [986, 9273.04] + - [989, 9273.04] - - [1024, 3558, 1, 4096] - - [983, 9567.23] + - [986, 9567.23] - - [4096, 3433, 1, 1024] - - [963, 9759.16] + - [966, 9759.16] - - [4096, 3180, 1, 1024] - - [978, 9738.53] + - [981, 9738.53] - - [1024, 3213, 1, 4096] - - [964, 8692.15] + - [967, 8692.15] - - [1024, 3900, 1, 4096] - - [968, 9388.51] + - [971, 9388.51] - - [4096, 3444, 1, 1024] - - [977, 9869.63] + - [980, 9869.63] - - [1024, 3504, 1, 4096] - - [986, 9429.28] + - [989, 9429.28] - - [4096, 4059, 1, 1024] - - [967, 9920.69] + - [970, 9920.69] - - [1024, 3442, 1, 4096] - - [986, 9272.91] + - [989, 9272.91] - - [4096, 3517, 1, 1024] - - [966, 9808.09] + - [969, 9808.09] - - [1024, 3566, 1, 4096] - - [985, 9622.79] + - [988, 9622.79] - - [4096, 3248, 1, 1024] - - [966, 9730.23] + - [969, 9730.23] - - [1024, 3547, 1, 4096] - - [985, 9564.63] + - [988, 9564.63] - - [64, 59, 1088, 59] - - [1004, 4611.66] + - [1007, 4611.66] - - [1024, 3340, 1, 4096] - - [985, 8992.11] + - [988, 8992.11] - - [4096, 3480, 1, 1024] - - [967, 9710.07] + - [970, 9710.07] - - [1024, 3968, 1, 4096] - - [967, 9543.01] + - [970, 9543.01] - - [4096, 3424, 1, 1024] - - [963, 9808.56] + - [966, 9808.56] - - [1024, 3906, 1, 1024] - - [965, 9150.44] + - [968, 9150.44] - - [4096, 3265, 1, 1024] - - [966, 9786.75] + - [969, 9786.75] - - [1024, 3384, 1, 4096] - - [986, 9119.46] + - [989, 9119.46] - - [1024, 3494, 1, 4096] - - [983, 9415.42] + - [986, 9415.42] - - [1024, 3236, 1, 4096] - - [980, 8767.04] + - [983, 8767.04] - - [4096, 3497, 1, 1024] - - [967, 9750.76] + - [970, 9750.76] - - [4096, 3354, 1, 1024] - - [978, 9665.07] + - [981, 9665.07] - - [4096, 3055, 1, 1024] - - [967, 9883.99] + - [970, 9883.99] - - [64, 11, 5456, 11] - - [990, 1368.24] + - [993, 1368.24] - - [4096, 3244, 1, 1024] - - [966, 9719.92] + - [969, 9719.92] - - [4096, 3139, 1, 1024] - - [977, 9736.96] + - [980, 9736.96] - - [4096, 3508, 1, 1024] - - [966, 9771.56] + - [969, 9771.56] - - [4096, 4050, 1, 1024] - - [966, 9898.69] + - [969, 9898.69] - - [1024, 3472, 1, 4096] - - [985, 9353.73] + - [988, 9353.73] - - [1024, 3861, 1, 1024] - - [964, 9061.22] + - [967, 9061.22] - - [1024, 3910, 1, 1024] - - [968, 9043.44] + - [971, 9043.44] - - [4096, 3371, 1, 1024] - - [978, 9738.14] + - [981, 9738.14] - - [64, 65, 992, 65] - - [1017, 4354.49] + - [1020, 4354.49] - - [1024, 3751, 1, 4096] - - [967, 9018.64] + - [970, 9018.64] - - [4096, 3325, 1, 1024] - - [966, 9958.63] + - [969, 9958.63] - - [1024, 3321, 1, 4096] - - [986, 8952.45] + - [989, 8952.45] - - [1024, 3944, 1, 1024] - - [965, 9117.25] + - [968, 9117.25] - - [4096, 3525, 1, 1024] - - [967, 9822.04] + - [970, 9822.04] - - [4096, 3382, 1, 1024] - - [978, 9720.11] + - [981, 9720.11] - - [64, 122, 528, 122] - - [1013, 6389.23] + - [1016, 6389.23] - - [1024, 3453, 1, 4096] - - [983, 9304.93] + - [986, 9304.93] - - [4096, 3564, 1, 1024] - - [966, 9911.22] + - [969, 9911.22] - - [4096, 3288, 1, 1024] - - [966, 9841.07] + - [969, 9841.07] - - [1024, 3925, 1, 4096] - - [967, 9418.85] + - [970, 9418.85] - - [1024, 3057, 1, 4096] - - [968, 9590.41] + - [971, 9590.41] - - [4096, 3488, 1, 1024] - - [967, 9732.4] + - [970, 9732.4] - - [4096, 3046, 1, 1024] - - [967, 9850.62] + - [970, 9850.62] - - [1024, 3189, 1, 4096] - - [979, 8676.92] + - [982, 8676.92] - - [4096, 3399, 1, 1024] - - [963, 9672.99] + - [966, 9672.99] - - [1024, 3383, 1, 4096] - - [986, 9102.27] + - [989, 9102.27] - - [1024, 3415, 1, 4096] - - [986, 9216.27] + - [989, 9216.27] - - [1024, 3388, 1, 4096] - - [986, 9127.43] + - [989, 9127.43] - - [1024, 3376, 1, 4096] - - [983, 9090.43] + - [986, 9090.43] - - [1024, 3473, 1, 4096] - - [986, 9354.02] + - [989, 9354.02] - - [4096, 3162, 1, 1024] - - [962, 9694.73] + - [965, 9694.73] - - [1024, 3448, 1, 4096] - - [986, 9283.35] + - [989, 9283.35] - - [4096, 3362, 1, 1024] - - [978, 9673.23] + - [981, 9673.23] - - [64, 228, 272, 228] - - [971, 7039.03] + - [974, 7039.03] - - [1024, 3262, 1, 4096] - - [980, 8850.74] + - [983, 8850.74] - - [1024, 3184, 1, 4096] - - [965, 8625.27] + - [968, 8625.27] - - [1024, 3378, 1, 4096] - - [985, 9105.17] + - [988, 9105.17] - - [4096, 3548, 1, 1024] - - [966, 9877.73] + - [969, 9877.73] - - [4096, 2977, 1, 1024] - - [966, 9647.71] + - [969, 9647.71] - - [64, 21, 2976, 21] - - [1001, 2364.71] + - [1004, 2364.71] - - [64, 112, 576, 111] - - [1000, 5973.58] + - [1003, 5973.58] - - [4096, 3443, 1, 1024] - - [962, 9784.4] + - [965, 9784.4] - - [1024, 3289, 1, 4096] - - [986, 8873.94] + - [989, 8873.94] - - [1024, 3483, 1, 4096] - - [982, 9380.47] + - [985, 9380.47] - - [4096, 3190, 1, 1024] - - [978, 9850.86] + - [981, 9850.86] - - [1024, 3421, 1, 4096] - - [986, 9213.96] + - [989, 9213.96] - - [1024, 3514, 1, 4096] - - [985, 9458.13] + - [988, 9458.13] - - [1024, 3532, 1, 4096] - - [986, 9512.93] + - [989, 9512.93] - - [1024, 3565, 1, 4096] - - [985, 9630.5] + - [988, 9630.5] - - [4096, 3422, 1, 1024] - - [963, 9733.69] + - [966, 9733.69] - - [4096, 3263, 1, 1024] - - [967, 9776.84] + - [970, 9776.84] - - [4096, 3296, 1, 1024] - - [967, 9860.51] + - [970, 9860.51] - - [4096, 3640, 1, 1024] - - [977, 9782.2] + - [980, 9782.2] - - [4096, 3463, 1, 1024] - - [966, 9671.9] + - [969, 9671.9] - - [4096, 3528, 1, 1024] - - [967, 9829.88] + - [970, 9829.88] - - [1024, 3351, 1, 4096] - - [980, 9054.27] + - [983, 9054.27] - - [1024, 3462, 1, 4096] - - [986, 9327.75] + - [989, 9327.75] - - [4096, 3226, 1, 1024] - - [967, 9674.83] + - [970, 9674.83] - - [4096, 3439, 1, 1024] - - [962, 9823.08] + - [965, 9823.08] - - [4096, 3121, 1, 1024] - - [962, 9672.54] + - [965, 9672.54] - - [1024, 4059, 1, 33708] - - [966, 9885.62] + - [969, 9885.62] - - [1024, 3311, 1, 4096] - - [986, 8909.91] + - [989, 8909.91] - - [1024, 3230, 1, 4096] - - [986, 8705.8] + - [989, 8705.8] - - [4096, 3353, 1, 1024] - - [978, 9671.76] + - [981, 9671.76] - - [4096, 3402, 1, 1024] - - [963, 9726.94] + - [966, 9726.94] - - [1024, 3427, 1, 4096] - - [986, 9233.45] + - [989, 9233.45] - - [1024, 3346, 1, 4096] - - [986, 9015.67] + - [989, 9015.67] - - [1024, 3126, 1, 4096] - - [980, 8519.21] + - [983, 8519.21] - - [1024, 3796, 1, 1024] - - [964, 8916.65] + - [967, 8916.65] - - [1024, 3990, 1, 4096] - - [968, 9600.76] + - [971, 9600.76] - - [1024, 3257, 1, 4096] - - [964, 8790.32] + - [967, 8790.32] - - [4096, 3996, 1, 1024] - - [967, 9788.15] + - [970, 9788.15] - - [64, 143, 432, 143] - - [995, 6087.14] + - [998, 6087.14] - - [1024, 3306, 1, 4096] - - [979, 9035.59] + - [982, 9035.59] - - [1024, 3389, 1, 4096] - - [986, 9134.82] + - [989, 9134.82] - - [1024, 3500, 1, 4096] - - [986, 9443.23] + - [989, 9443.23] - - [1024, 3999, 1, 33708] - - [967, 9741.14] + - [970, 9741.14] - - [4096, 3486, 1, 1024] - - [967, 9719.57] + - [970, 9719.57] - - [1024, 3438, 1, 4096] - - [986, 9259.28] + - [989, 9259.28] - - [4096, 3616, 1, 1024] - - [977, 9739.67] + - [980, 9739.67] - - [1024, 3955, 1, 1024] - - [979, 9260.27] + - [982, 9260.27] - - [4096, 3430, 1, 1024] - - [978, 9819.85] + - [981, 9819.85] - - [4096, 3271, 1, 1024] - - [967, 9801.94] + - [970, 9801.94] - - [1024, 3364, 1, 4096] - - [979, 9144.53] + - [982, 9144.53] - - [64, 54, 1184, 54] - - [999, 4315.68] + - [1002, 4315.68] - - [1024, 3497, 1, 4096] - - [986, 9429.32] + - [989, 9429.32] - - [4096, 3503, 1, 1024] - - [966, 9764.38] + - [969, 9764.38] - - [4096, 3344, 1, 1024] - - [963, 9614.06] + - [966, 9614.06] - - [1024, 3457, 1, 4096] - - [986, 9320.5] + - [989, 9320.5] - - [4096, 3466, 1, 1024] - - [966, 9677.71] + - [969, 9677.71] - - [1024, 3976, 1, 33708] - - [967, 9685.28] + - [970, 9685.28] - - [1024, 3395, 1, 4096] - - [985, 9146.29] + - [988, 9146.29] - - [4096, 3361, 1, 1024] - - [977, 9677.79] + - [980, 9677.79] - - [1024, 3751, 1, 33708] - - [975, 9234.59] + - [978, 9234.59] - - [1024, 3822, 1, 1024] - - [964, 8977.73] + - [967, 8977.73] - - [4096, 3315, 1, 1024] - - [967, 9922.44] + - [970, 9922.44] - - [1024, 3163, 1, 4096] - - [979, 8577.69] + - [982, 8577.69] - - [4096, 3547, 1, 1024] - - [967, 9882.82] + - [970, 9882.82] - - [4096, 3340, 1, 1024] - - [977, 9635.32] + - [980, 9635.32] - - [1024, 3296, 1, 4096] - - [986, 8874.56] + - [989, 8874.56] - - [1024, 3468, 1, 4096] - - [986, 9350.16] + - [989, 9350.16] - - [4096, 3294, 1, 1024] - - [966, 9856.77] + - [969, 9856.77] - - [1024, 3406, 1, 4096] - - [982, 9162.74] + - [985, 9162.74] - - [1024, 3860, 1, 33708] - - [966, 9403.46] + - [969, 9403.46] - - [1024, 3584, 1, 4096] - - [983, 9677.34] + - [986, 9677.34] - - [4096, 3189, 1, 1024] - - [978, 9820.59] + - [981, 9820.59] - - [4096, 3494, 1, 1024] - - [966, 9747.58] + - [969, 9747.58] - - [64, 135, 480, 135] - - [992, 5966.24] + - [995, 5966.24] - - [1024, 3093, 1, 4096] - - [980, 8445.96] + - [983, 8445.96] - - [4096, 3421, 1, 1024] - - [963, 9775.93] + - [966, 9775.93] - - [1024, 3479, 1, 4096] - - [986, 9376.44] + - [989, 9376.44] - - [1024, 3433, 1, 4096] - - [986, 9251.04] + - [989, 9251.04] - - [4096, 3311, 1, 1024] - - [966, 9901.43] + - [969, 9901.43] - - [1024, 3381, 1, 4096] - - [986, 9103.89] + - [989, 9103.89] - - [1024, 3996, 1, 4096] - - [967, 9609.46] + - [970, 9609.46] - - [4096, 3384, 1, 1024] - - [977, 9749.91] + - [980, 9749.91] - - [1024, 3247, 1, 4096] - - [965, 8872.49] + - [968, 8872.49] - - [1024, 3169, 1, 4096] - - [964, 8597.51] + - [967, 8597.51] - - [1024, 3088, 1, 4096] - - [980, 8409.97] + - [983, 8409.97] - - [1024, 3363, 1, 4096] - - [986, 9069.4] + - [989, 9069.4] - - [1024, 3538, 1, 4096] - - [985, 9529.58] + - [988, 9529.58] - - [1024, 3996, 1, 1024] - - [969, 9322.96] + - [972, 9322.96] - - [4096, 3169, 1, 1024] - - [963, 9821.3] + - [966, 9821.3] - - [4096, 3538, 1, 1024] - - [966, 9859.32] + - [969, 9859.32] - - [4096, 3401, 1, 1024] - - [963, 9754.4] + - [966, 9754.4] - - [4096, 3581, 1, 1024] - - [966, 9960.61] + - [969, 9960.61] - - [1024, 3180, 1, 4096] - - [964, 8634.95] + - [967, 8634.95] - - [1024, 3870, 1, 1024] - - [965, 9085.59] + - [968, 9085.59] - - [4096, 3555, 1, 1024] - - [966, 9905.64] + - [969, 9905.64] - - [4096, 3412, 1, 1024] - - [978, 9778.46] + - [981, 9778.46] - - [4096, 3302, 1, 1024] - - [966, 9888.61] + - [969, 9888.61] - - [1024, 3561, 1, 4096] - - [982, 9596.95] + - [985, 9596.95] - - [1024, 3302, 1, 4096] - - [986, 8900.77] + - [989, 8900.77] - - [1024, 3976, 1, 4096] - - [968, 9563.12] + - [971, 9563.12] - - [4096, 3485, 1, 1024] - - [966, 9722.47] + - [969, 9722.47] - - [4096, 3534, 1, 1024] - - [966, 9847.12] + - [969, 9847.12] - - [1024, 3110, 1, 4096] - - [979, 8458.46] + - [982, 8458.46] - - [1024, 3401, 1, 4096] - - [986, 9174.71] + - [989, 9174.71] - - [4096, 3216, 1, 1024] - - [966, 9645.39] + - [969, 9645.39] - - [1024, 4020, 1, 33708] - - [966, 9793.51] + - [969, 9793.51] - - [1024, 3215, 1, 4096] - - [986, 8677.41] + - [989, 8677.41] - - [4096, 3566, 1, 1024] - - [966, 9924.68] + - [969, 9924.68] - - [1024, 3137, 1, 4096] - - [964, 8546.97] + - [967, 8546.97] - - [4096, 3359, 1, 1024] - - [963, 9673.63] + - [966, 9673.63] - - [4096, 3392, 1, 1024] - - [978, 9757.41] + - [981, 9757.41] - - [1024, 3506, 1, 4096] - - [986, 9442.9] + - [989, 9442.9] - - [4096, 3233, 1, 1024] - - [966, 9698.6] + - [969, 9698.6] - - [1024, 3444, 1, 4096] - - [986, 9275.44] + - [989, 9275.44] - - [1024, 3975, 1, 4096] - - [967, 9556.77] + - [970, 9556.77] - - [1024, 3870, 1, 33708] - - [966, 9427.34] + - [969, 9427.34] - - [4096, 3465, 1, 1024] - - [967, 9674.91] + - [970, 9674.91] - - [4096, 3968, 1, 1024] - - [963, 9927.83] + - [966, 9927.83] - - [1024, 3523, 1, 4096] - - [986, 9494.05] + - [989, 9494.05] - - [64, 10, 5952, 10] - - [990, 1224.06] + - [993, 1224.06] - - [4096, 3990, 1, 1024] - - [966, 9771.17] + - [969, 9771.17] - - [1024, 3549, 1, 4096] - - [985, 9553.32] + - [988, 9553.32] - - [1024, 3342, 1, 4096] - - [986, 9007.21] + - [989, 9007.21] - - [4096, 3476, 1, 1024] - - [966, 9703.56] + - [969, 9703.56] - - [64, 232, 272, 228] - - [972, 7078.83] + - [975, 7078.83] - - [1024, 3418, 1, 4096] - - [986, 9212.99] + - [989, 9212.99] - - [1024, 3859, 1, 1024] - - [965, 9087.44] + - [968, 9087.44] - - [4096, 3339, 1, 1024] - - [978, 9593.9] + - [981, 9593.9] - - [4096, 3452, 1, 1024] - - [963, 9872.59] + - [966, 9872.59] - - [4096, 3293, 1, 1024] - - [966, 9842.55] + - [969, 9842.55] - - [4096, 3840, 1, 1024] - - [967, 10030.7] + - [970, 10030.7] - - [1024, 3369, 1, 4096] - - [964, 9099.62] + - [967, 9099.62] - - [64, 193, 320, 193] - - [994, 6425.7] + - [997, 6425.7] - - [1024, 3544, 1, 4096] - - [983, 9556.54] + - [986, 9556.54] - - [4096, 3493, 1, 1024] - - [967, 9743.24] + - [970, 9743.24] - - [4096, 3350, 1, 1024] - - [978, 9653.01] + - [981, 9653.01] - - [64, 71, 896, 71] - - [1018, 4686.63] + - [1021, 4686.63] - - [4096, 3256, 1, 1024] - - [966, 9763.68] + - [969, 9763.68] - - [1024, 3870, 1, 4096] - - [968, 9305.18] + - [971, 9305.18] - - [4096, 4012, 1, 1024] - - [967, 9817.25] + - [970, 9817.25] - - [1024, 3280, 1, 4096] - - [986, 8841.92] + - [989, 8841.92] - - [4096, 3456, 1, 1024] - - [962, 9874.33] + - [965, 9874.33] - - [1024, 3555, 1, 4096] - - [985, 9599.53] + - [988, 9599.53] - - [4096, 3014, 1, 1024] - - [966, 9762.18] + - [969, 9762.18] - - [1024, 3474, 1, 4096] - - [986, 9373.57] + - [989, 9373.57] - - [4096, 3367, 1, 1024] - - [962, 9694.54] + - [965, 9694.54] - - [4096, 3432, 1, 1024] - - [978, 9855.17] + - [981, 9855.17] - - [64, 84, 752, 84] - - [1005, 5247.08] + - [1008, 5247.08] - - [4096, 3273, 1, 1024] - - [967, 9801.77] + - [970, 9801.77] - - [4096, 3130, 1, 1024] - - [963, 9672.42] + - [966, 9672.42] - - [1024, 2984, 1, 4096] - - [968, 9403.6] + - [971, 9403.6] - - [1024, 3995, 1, 1024] - - [980, 9392.51] + - [983, 9392.51] - - [1024, 3517, 1, 4096] - - [986, 9481.29] + - [989, 9481.29] - - [1024, 3455, 1, 4096] - - [986, 9302.19] + - [989, 9302.19] - - [1024, 3939, 1, 4096] - - [968, 9469.79] + - [971, 9469.79] - - [64, 49, 1296, 49] - - [998, 3938.86] + - [1001, 3938.86] - - [64, 14, 4368, 14] - - [990, 1802.37] + - [993, 1802.37] - - [64, 25, 2512, 25] - - [1009, 2760.44] + - [1012, 2760.44] - - [4096, 3147, 1, 1024] - - [978, 9712.93] + - [981, 9712.93] - - [4096, 3516, 1, 1024] - - [966, 9805.83] + - [969, 9805.83] - - [1024, 3876, 1, 4096] - - [968, 9320.46] + - [971, 9320.46] - - [1024, 3191, 1, 4096] - - [965, 8640.66] + - [968, 8640.66] - - [4096, 3411, 1, 1024] - - [977, 9737.27] + - [980, 9737.27] - - [1024, 3337, 1, 4096] - - [986, 8990.03] + - [989, 8990.03] - - [1024, 3512, 1, 4096] - - [986, 9459.55] + - [989, 9459.55] - - [4096, 3301, 1, 1024] - - [966, 9877.16] + - [969, 9877.16] - - [1024, 3450, 1, 4096] - - [985, 9283.01] + - [988, 9283.01] - - [4096, 3533, 1, 1024] - - [966, 9848.52] + - [969, 9848.52] - - [4096, 3390, 1, 1024] - - [978, 9764.51] + - [981, 9764.51] - - [4096, 3231, 1, 1024] - - [966, 9693.71] + - [969, 9693.71] - - [1024, 2499, 1, 4096] - - [985, 9304.71] + - [988, 9304.71] - - [1024, 3186, 1, 4096] - - [965, 8649.45] + - [968, 8649.45] - - [1024, 3380, 1, 4096] - - [986, 9101.67] + - [989, 9101.67] - - [4096, 3496, 1, 1024] - - [967, 9754.2] + - [970, 9754.2] - - [1024, 3956, 1, 33708] - - [966, 9636.67] + - [969, 9636.67] - - [1024, 3976, 1, 1024] - - [968, 9248.31] + - [971, 9248.31] - - [4096, 2736, 1, 1024] - - [966, 9651.81] + - [969, 9651.81] - - [1024, 3291, 1, 4096] - - [986, 8868.84] + - [989, 8868.84] - - [1024, 3944, 1, 33708] - - [967, 9606.9] + - [970, 9606.9] - - [1024, 3485, 1, 4096] - - [985, 9385.86] + - [988, 9385.86] - - [4096, 3138, 1, 1024] - - [963, 9672.05] + - [966, 9672.05] - - [1024, 3423, 1, 4096] - - [986, 9222.67] + - [989, 9222.67] - - [1024, 3491, 1, 4096] - - [986, 9404.92] + - [989, 9404.92] - - [1024, 3860, 1, 4096] - - [969, 9282.84] + - [972, 9282.84] - - [4096, 3211, 1, 1024] - - [966, 9640.32] + - [969, 9640.32] - - [1024, 3221, 1, 4096] - - [980, 8709.3] + - [983, 8709.3] - - [1024, 2917, 1, 4096] - - [968, 9177.01] + - [971, 9177.01] - - [4096, 3475, 1, 1024] - - [966, 9703.35] + - [969, 9703.35] - - [4096, 3524, 1, 1024] - - [966, 9816.13] + - [969, 9816.13] - - [4096, 2985, 1, 1024] - - [967, 9686.81] + - [970, 9686.81] - - [1024, 3480, 1, 4096] - - [986, 9380.1] + - [989, 9380.1] - - [4096, 3222, 1, 1024] - - [966, 9666.7] + - [969, 9666.7] - - [4096, 3451, 1, 1024] - - [962, 9877.81] + - [965, 9877.81] - - [1024, 3969, 1, 33708] - - [966, 9669.54] + - [969, 9669.54] - - [1024, 3640, 1, 1024] - - [973, 8565.58] + - [976, 8565.58] - - [1024, 3297, 1, 4096] - - [982, 8889.12] + - [985, 8889.12] - - [4096, 3944, 1, 1024] - - [963, 9902.75] + - [966, 9902.75] - - [1024, 3216, 1, 4096] - - [965, 8695.78] + - [968, 8695.78] - - [1024, 3840, 1, 1024] - - [979, 9045.95] + - [982, 9045.95] - - [4096, 3349, 1, 1024] - - [977, 9676.72] + - [980, 9676.72] - - [4096, 3398, 1, 1024] - - [963, 9775.74] + - [966, 9775.74] - - [1024, 3154, 1, 4096] - - [980, 8662.16] + - [983, 8662.16] - - [1024, 3978, 1, 33708] - - [967, 9689.06] + - [970, 9689.06] - - [1024, 3348, 1, 4096] - - [986, 9014.57] + - [989, 9014.57] - - [4096, 3304, 1, 1024] - - [967, 9886.7] + - [970, 9886.7] - - [4096, 4030, 1, 1024] - - [967, 9859.0] + - [970, 9859.0] - - [1024, 4026, 1, 1024] - - [964, 9326.54] + - [967, 9326.54] - - [4096, 3471, 1, 1024] - - [966, 9682.9] + - [969, 9682.9] - - [1024, 3259, 1, 4096] - - [980, 8792.09] + - [983, 8792.09] - - [64, 132, 480, 132] - - [1020, 6027.76] + - [1023, 6027.76] - - [1024, 3308, 1, 4096] - - [985, 8905.04] + - [988, 8905.04] - - [4096, 3391, 1, 1024] - - [978, 9765.25] + - [981, 9765.25] - - [1024, 3312, 1, 4096] - - [986, 8917.64] + - [989, 8917.64] - - [1024, 3502, 1, 4096] - - [986, 9435.52] + - [989, 9435.52] - - [1024, 3968, 1, 33708] - - [966, 9668.14] + - [969, 9668.14] - - [1024, 3424, 1, 4096] - - [982, 9215.89] + - [985, 9215.89] - - [64, 13, 4672, 13] - - [991, 1662.25] + - [994, 1662.25] - - [4096, 4032, 1, 1024] - - [977, 9877.72] + - [980, 9877.72] - - [1024, 3900, 1, 1024] - - [980, 9116.83] + - [983, 9116.83] - - [4096, 3442, 1, 1024] - - [977, 9773.08] + - [980, 9773.08] - - [1024, 3366, 1, 4096] - - [986, 9079.36] + - [989, 9079.36] - - [4096, 3999, 1, 1024] - - [966, 9786.36] + - [969, 9786.36] - - [1024, 3477, 1, 4096] - - [986, 9364.79] + - [989, 9364.79] - - [1024, 2505, 1, 4096] - - [986, 9303.93] + - [989, 9303.93] - - [4096, 3515, 1, 1024] - - [966, 9797.83] + - [969, 9797.83] - - [1024, 3564, 1, 4096] - - [982, 9632.76] + - [985, 9632.76] - - [4096, 3057, 1, 1024] - - [967, 9880.09] + - [970, 9880.09] - - [1024, 3339, 1, 4096] - - [965, 9029.76] + - [968, 9029.76] - - [4096, 3262, 1, 1024] - - [966, 9780.0] + - [969, 9780.0] - - [1024, 4030, 1, 4096] - - [969, 9681.9] + - [972, 9681.9] - - [1024, 3265, 1, 4096] - - [986, 8797.42] + - [989, 8797.42] - - [1024, 3459, 1, 4096] - - [986, 9312.96] + - [989, 9312.96] - - [4096, 3462, 1, 1024] - - [967, 9669.63] + - [970, 9669.63] - - [64, 85, 752, 85] - - [1005, 5186.83] + - [1008, 5186.83] - - [1024, 3513, 1, 4096] - - [983, 9469.05] + - [986, 9469.05] - - [1024, 3397, 1, 4096] - - [986, 9151.67] + - [989, 9151.67] - - [4096, 3572, 1, 1024] - - [966, 9945.6] + - [969, 9945.6] - - [4096, 3389, 1, 1024] - - [978, 9740.76] + - [981, 9740.76] - - [4096, 3438, 1, 1024] - - [978, 9822.37] + - [981, 9822.37] - - [64, 102, 624, 100] - - [1013, 5486.9] + - [1016, 5486.9] - - [1024, 3640, 1, 33708] - - [974, 9083.43] + - [977, 9083.43] - - [1024, 3995, 1, 33708] - - [967, 9731.89] + - [970, 9731.89] - - [1024, 3165, 1, 4096] - - [979, 8601.8] + - [982, 8601.8] - - [4096, 3543, 1, 1024] - - [967, 9868.53] + - [970, 9868.53] - - [4096, 3352, 1, 1024] - - [962, 9668.34] + - [965, 9668.34] - - [1024, 3359, 1, 4096] - - [983, 9050.23] + - [986, 9050.23] - - [1024, 3470, 1, 4096] - - [986, 9355.07] + - [989, 9355.07] - - [64, 15, 4096, 15] - - [990, 1945.33] + - [993, 1945.33] - - [1024, 3392, 1, 4096] - - [985, 9139.61] + - [988, 9139.61] - - [64, 78, 816, 77] - - [997, 4870.46] + - [1000, 4870.46] - - [4096, 3137, 1, 1024] - - [962, 9600.12] + - [965, 9600.12] - - [4096, 3506, 1, 1024] - - [967, 9778.98] + - [970, 9778.98] - - [1024, 3095, 1, 4096] - - [979, 8381.14] + - [982, 8381.14] - - [1024, 3859, 1, 4096] - - [966, 9288.53] + - [969, 9288.53] - - [4096, 3369, 1, 1024] - - [978, 9697.63] + - [981, 9697.63] - - [64, 45, 1424, 45] - - [1015, 3883.64] + - [1018, 3883.64] - - [1024, 3435, 1, 4096] - - [986, 9264.52] + - [989, 9264.52] - - [1024, 3354, 1, 4096] - - [986, 9035.37] + - [989, 9035.37] - - [1024, 3055, 1, 4096] - - [967, 9597.35] + - [970, 9597.35] - - [4096, 3523, 1, 1024] - - [966, 9821.69] + - [969, 9821.69] - - [4096, 3380, 1, 1024] - - [962, 9721.29] + - [965, 9721.29] - - [1024, 3233, 1, 4096] - - [979, 8724.65] + - [982, 8724.65] - - [4096, 3221, 1, 1024] - - [966, 9660.94] + - [969, 9660.94] - - [4096, 3270, 1, 1024] - - [966, 9797.82] + - [969, 9797.82] - - [4096, 3593, 1, 1024] - - [977, 9679.21] + - [980, 9679.21] - - [1024, 3358, 1, 4096] - - [986, 9051.72] + - [989, 9051.72] - - [1024, 3540, 1, 4096] - - [986, 9533.49] + - [989, 9533.49] - - [4096, 3502, 1, 1024] - - [967, 9760.55] + - [970, 9760.55] - - [4096, 2505, 1, 1024] - - [967, 9680.42] + - [970, 9680.42] - - [4096, 3397, 1, 1024] - - [977, 9785.75] + - [980, 9785.75] - - [1024, 3300, 1, 4096] - - [980, 8907.75] + - [983, 8907.75] - - [4096, 3095, 1, 1024] - - [963, 9618.68] + - [966, 9618.68] - - [1024, 3182, 1, 4096] - - [979, 8606.06] + - [982, 8606.06] - - [1024, 3299, 1, 4096] - - [985, 8885.38] + - [988, 8885.38] - - [1024, 3276, 1, 4096] - - [980, 8872.65] + - [983, 8872.65] - - [1024, 3360, 1, 4096] - - [983, 9044.1] + - [986, 9044.1] - - [4096, 3360, 1, 1024] - - [978, 9681.29] + - [981, 9681.29] - - [4096, 2918, 1, 1024] - - [962, 9732.64] + - [965, 9732.64] - - [1024, 3939, 1, 33708] - - [966, 9595.86] + - [969, 9595.86] - - [4096, 3314, 1, 1024] - - [967, 9914.92] + - [970, 9914.92] - - [1024, 3319, 1, 4096] - - [986, 8956.27] + - [989, 8956.27] - - [64, 35, 1808, 35] - - [1003, 3060.17] + - [1006, 3060.17] - - [1024, 3942, 1, 1024] - - [979, 9211.73] + - [982, 9211.73] - - [1024, 3465, 1, 4096] - - [986, 9340.63] + - [989, 9340.63] - - [4096, 3546, 1, 1024] - - [967, 9875.31] + - [970, 9875.31] - - [1024, 3403, 1, 4096] - - [979, 9224.24] + - [982, 9224.24] - - [1024, 3948, 1, 1024] - - [965, 9245.53] + - [968, 9245.53] - - [4096, 3441, 1, 1024] - - [978, 9758.62] + - [981, 9758.62] - - [1024, 3139, 1, 4096] - - [979, 8582.74] + - [982, 8582.74] - - [1024, 3563, 1, 4096] - - [986, 9620.64] + - [989, 9620.64] - - [1024, 3508, 1, 4096] - - [983, 9449.26] + - [986, 9449.26] - - [1024, 3975, 1, 33708] - - [966, 9683.45] + - [969, 9683.45] - - [1024, 3446, 1, 4096] - - [985, 9289.41] + - [988, 9289.41] - - [1024, 3529, 1, 4096] - - [982, 9491.19] + - [985, 9491.19] - - [64, 112, 576, 112] - - [1007, 6387.04] + - [1010, 6387.04] - - [4096, 3461, 1, 1024] - - [967, 9663.23] + - [970, 9663.23] - - [1024, 3574, 1, 4096] - - [985, 9662.78] + - [988, 9662.78] - - [1024, 3101, 1, 4096] - - [980, 8468.24] + - [983, 8468.24] - - [1024, 3927, 1, 1024] - - [965, 9207.87] + - [968, 9207.87] - - [4096, 3224, 1, 1024] - - [967, 9665.51] + - [970, 9665.51] - - [4096, 3437, 1, 1024] - - [963, 9857.11] + - [966, 9857.11] - - [4096, 3900, 1, 1024] - - [978, 9826.15] + - [981, 9826.15] - - [1024, 3495, 1, 4096] - - [986, 9412.31] + - [989, 9412.31] - - [1024, 3977, 1, 33708] - - [966, 9687.77] + - [969, 9687.77] - - [1024, 3328, 1, 4096] - - [986, 8975.47] + - [989, 8975.47] - - [4096, 3168, 1, 1024] - - [962, 9754.77] + - [965, 9754.77] - - [1024, 4026, 1, 33708] - - [966, 9807.14] + - [969, 9807.14] - - [1024, 3292, 1, 4096] - - [979, 8901.73] + - [982, 8901.73] - - [1024, 3294, 1, 4096] - - [986, 8876.93] + - [989, 8876.93] - - [4096, 3335, 1, 1024] - - [963, 9616.13] + - [966, 9616.13] - - [4096, 3400, 1, 1024] - - [977, 9710.63] + - [980, 9710.63] - - [1024, 3287, 1, 4096] - - [964, 8907.97] + - [967, 8907.97] - - [1024, 3910, 1, 4096] - - [968, 9400.93] + - [971, 9400.93] - - [1024, 3780, 1, 1024] - - [979, 8863.19] + - [982, 8863.19] - - [4096, 3098, 1, 1024] - - [963, 9606.37] + - [966, 9606.37] - - [1024, 3584, 1, 33708] - - [986, 9775.23] + - [989, 9775.23] - - [64, 29, 2176, 29] - - [1008, 3134.93] + - [1011, 3134.93] - - [1024, 3371, 1, 4096] - - [964, 9117.71] + - [967, 9117.71] - - [1024, 3546, 1, 4096] - - [986, 9547.2] + - [989, 9547.2] - - [1024, 4012, 1, 1024] - - [968, 9353.63] + - [971, 9353.63] - - [4096, 3505, 1, 1024] - - [966, 9773.07] + - [969, 9773.07] - - [4096, 3554, 1, 1024] - - [966, 9895.49] + - [969, 9895.49] - - [4096, 3063, 1, 1024] - - [966, 9898.88] + - [969, 9898.88] - - [1024, 3900, 1, 33708] - - [967, 9502.83] + - [970, 9502.83] - - [1024, 3345, 1, 4096] - - [986, 9015.75] + - [989, 9015.75] - - [1024, 3357, 1, 4096] - - [986, 9041.13] + - [989, 9041.13] - - [1024, 3282, 1, 4096] - - [979, 8860.07] + - [982, 8860.07] - - [4096, 3484, 1, 1024] - - [967, 9721.23] + - [970, 9721.23] - - [1024, 3557, 1, 4096] - - [983, 9573.38] + - [986, 9573.38] - - [1024, 3476, 1, 4096] - - [986, 9361.62] + - [989, 9361.62] - - [1024, 3751, 1, 1024] - - [980, 8849.01] + - [983, 8849.01] - - [4096, 3379, 1, 1024] - - [963, 9741.39] + - [966, 9741.39] - - [4096, 3428, 1, 1024] - - [962, 9767.72] + - [965, 9767.72] - - [4096, 3126, 1, 1024] - - [977, 9701.8] + - [980, 9701.8] - - [64, 41, 1552, 41] - - [1012, 3555.59] + - [1015, 3555.59] - - [1024, 3325, 1, 4096] - - [964, 8962.31] + - [967, 8962.31] - - [4096, 3501, 1, 1024] - - [966, 9761.91] + - [969, 9761.91] - - [4096, 3358, 1, 1024] - - [962, 9680.32] + - [965, 9680.32] - - [1024, 3441, 1, 4096] - - [986, 9271.17] + - [989, 9271.17] - - [1024, 3552, 1, 4096] - - [982, 9565.32] + - [985, 9565.32] - - [4096, 3232, 1, 1024] - - [967, 9696.71] + - [970, 9696.71] - - [64, 18, 3440, 18] - - [987, 2059.23] + - [990, 2059.23] - - [1024, 3412, 1, 4096] - - [986, 9199.18] + - [989, 9199.18] - - [1024, 3372, 1, 4096] - - [983, 9083.39] + - [986, 9083.39] - - [1024, 3585, 1, 4096] - - [973, 8710.19] + - [976, 8710.19] - - [4096, 3143, 1, 1024] - - [978, 9692.02] + - [981, 9692.02] - - [4096, 3464, 1, 1024] - - [966, 9661.83] + - [969, 9661.83] - - [1024, 3145, 1, 4096] - - [965, 8526.23] + - [968, 8526.23] - - [4096, 3375, 1, 1024] - - [977, 9734.68] + - [980, 9734.68] - - [4096, 2917, 1, 1024] - - [962, 9714.47] + - [965, 9714.47] - - [4096, 3978, 1, 1024] - - [967, 9741.33] + - [970, 9741.33] - - [1024, 2765, 1, 4096] - - [968, 8706.65] + - [971, 8706.65] - - [64, 148, 432, 148] - - [993, 6372.07] + - [996, 6372.07] - - [1024, 3452, 1, 4096] - - [985, 9301.28] + - [988, 9301.28] - - [4096, 3584, 1, 1024] - - [967, 10005.6] + - [970, 10005.6] - - [4096, 3545, 1, 1024] - - [967, 9877.77] + - [970, 9877.77] - - [1024, 3352, 1, 4096] - - [986, 9035.09] + - [989, 9035.09] - - [64, 159, 400, 160] - - [995, 6952.01] + - [998, 6952.01] - - [4096, 3292, 1, 1024] - - [966, 9856.41] + - [969, 9856.41] - - [1024, 3525, 1, 4096] - - [986, 9501.4] + - [989, 9501.4] - - [1024, 3266, 1, 4096] - - [986, 8817.33] + - [989, 8817.33] - - [1024, 3382, 1, 4096] - - [985, 9101.44] + - [988, 9101.44] - - [4096, 3492, 1, 1024] - - [966, 9747.19] + - [969, 9747.19] - - [4096, 3419, 1, 1024] - - [978, 9745.78] + - [981, 9745.78] - - [1024, 3796, 1, 33708] - - [975, 9356.16] + - [978, 9356.16] - - [1024, 3293, 1, 4096] - - [982, 8868.3] + - [985, 8868.3] - - [4096, 3796, 1, 1024] - - [967, 9885.26] + - [970, 9885.26] - - [1024, 3487, 1, 4096] - - [983, 9391.24] + - [986, 9391.24] - - [4096, 3166, 1, 1024] - - [978, 9718.36] + - [981, 9718.36] - - [64, 102, 624, 101] - - [1007, 5547.74] + - [1010, 5547.74] - - [1024, 3409, 1, 4096] - - [986, 9187.78] + - [989, 9187.78] - - [1024, 3520, 1, 4096] - - [985, 9484.99] + - [988, 9484.99] - - [1024, 3573, 1, 4096] - - [986, 9652.61] + - [989, 9652.61] - - [4096, 3366, 1, 1024] - - [962, 9684.21] + - [965, 9684.21] - - [4096, 3720, 1, 1024] - - [978, 9703.24] + - [981, 9703.24] - - [4096, 3207, 1, 1024] - - [966, 9626.11] + - [969, 9626.11] - - [4096, 3272, 1, 1024] - - [966, 9795.41] + - [969, 9795.41] - - [1024, 3390, 1, 4096] - - [986, 9125.78] + - [989, 9125.78] - - [4096, 3183, 1, 1024] - - [978, 9825.77] + - [981, 9825.77] - - [4096, 3536, 1, 1024] - - [967, 9846.41] + - [970, 9846.41] - - [4096, 3563, 1, 1024] - - [967, 9913.7] + - [970, 9913.7] - - [1024, 3482, 1, 4096] - - [986, 9376.81] + - [989, 9376.81] - - [4096, 3447, 1, 1024] - - [977, 9874.99] + - [980, 9874.99] - - [4096, 3955, 1, 1024] - - [962, 9922.29] + - [965, 9922.29] - - [4096, 4005, 1, 1024] - - [967, 9803.33] + - [970, 9803.33] - - [1024, 3493, 1, 4096] - - [986, 9411.27] + - [989, 9411.27] - - [4096, 3410, 1, 1024] - - [962, 9788.24] + - [965, 9788.24] - - [1024, 3422, 1, 4096] - - [985, 9216.18] + - [988, 9216.18] - - [1024, 3350, 1, 4096] - - [980, 9067.92] + - [983, 9067.92] - - [4096, 3300, 1, 1024] - - [967, 9883.19] + - [970, 9883.19] - - [4096, 3910, 1, 1024] - - [977, 9800.02] + - [980, 9800.02] - - [1024, 3489, 1, 4096] - - [986, 9398.56] + - [989, 9398.56] - - [4096, 3483, 1, 1024] - - [966, 9715.86] + - [969, 9715.86] - - [4096, 3532, 1, 1024] - - [967, 9837.89] + - [970, 9837.89] - - [64, 101, 624, 101] - - [1007, 5452.18] + - [1010, 5452.18] - - [4096, 3230, 1, 1024] - - [967, 9683.5] + - [970, 9683.5] - - [4096, 3427, 1, 1024] - - [962, 9760.62] + - [965, 9760.62] - - [1024, 3377, 1, 4096] - - [986, 9101.07] + - [989, 9101.07] - - [1024, 3488, 1, 4096] - - [985, 9381.89] + - [988, 9381.89] - - [1024, 3616, 1, 4096] - - [968, 8709.23] + - [971, 8709.23] - - [1024, 3426, 1, 4096] - - [986, 9229.33] + - [989, 9229.33] - - [4096, 3357, 1, 1024] - - [978, 9668.4] + - [981, 9668.4] - - [4096, 3406, 1, 1024] - - [963, 9748.47] + - [966, 9748.47] - - [1024, 3046, 1, 4096] - - [968, 9590.33] + - [971, 9590.33] - - [1024, 3272, 1, 4096] - - [979, 8930.1] + - [982, 8930.1] - - [1024, 3256, 1, 4096] - - [964, 8828.06] + - [967, 8828.06] - - [4096, 3247, 1, 1024] - - [966, 9741.71] + - [969, 9741.71] - - [4096, 3088, 1, 1024] - - [978, 9588.97] + - [981, 9588.97] - - [1024, 3531, 1, 4096] - - [985, 9500.96] + - [988, 9500.96] - - [64, 160, 400, 160] - - [1021, 7333.93] + - [1024, 7333.93] - - [4096, 3511, 1, 1024] - - [967, 9789.28] + - [970, 9789.28] - - [1024, 3720, 1, 33708] - - [976, 9214.58] + - [979, 9214.58] - - [1024, 3267, 1, 4096] - - [979, 8830.94] + - [982, 8830.94] - - [1024, 3270, 1, 4096] - - [980, 8876.58] + - [983, 8876.58] - - [1024, 3461, 1, 4096] - - [985, 9327.45] + - [988, 9327.45] - - [4096, 3474, 1, 1024] - - [966, 9696.94] + - [969, 9696.94] - - [4096, 2984, 1, 1024] - - [967, 9673.98] + - [970, 9673.98] - - [1024, 3399, 1, 4096] - - [985, 9158.48] + - [988, 9158.48] - - [4096, 3574, 1, 1024] - - [966, 9942.2] + - [969, 9942.2] - - [1024, 3876, 1, 1024] - - [980, 9085.03] + - [983, 9085.03] - - [4096, 3337, 1, 1024] - - [963, 9611.33] + - [966, 9611.33] - - [4096, 3450, 1, 1024] - - [978, 9930.25] + - [981, 9930.25] - - [1024, 3720, 1, 1024] - - [964, 8755.39] + - [967, 8755.39] - - [1024, 4059, 1, 1024] - - [969, 9366.57] + - [972, 9366.57] - - [4096, 3291, 1, 1024] - - [966, 9856.23] + - [969, 9856.23] - - [64, 93, 688, 93] - - [1010, 5497.01] + - [1013, 5497.01] - - [4096, 3995, 1, 1024] - - [966, 9776.57] + - [969, 9776.57] - - [64, 147, 432, 147] - - [996, 6233.78] + - [999, 6233.78] - - [4096, 3491, 1, 1024] - - [966, 9742.84] + - [969, 9742.84] - - [4096, 3348, 1, 1024] - - [978, 9634.01] + - [981, 9634.01] - - [4096, 3925, 1, 1024] - - [977, 9848.44] + - [980, 9848.44] - - [4096, 3894, 1, 1024] - - [977, 9812.45] + - [980, 9812.45] - - [1024, 3456, 1, 4096] - - [986, 9317.81] + - [989, 9317.81] - - [1024, 3394, 1, 4096] - - [985, 9148.76] + - [988, 9148.76] - - [64, 100, 624, 102] - - [1007, 5416.85] + - [1010, 5416.85] - - [4096, 3165, 1, 1024] - - [977, 9743.25] + - [980, 9743.25] - - [4096, 3470, 1, 1024] - - [967, 9690.94] + - [970, 9690.94] - - [1024, 3014, 1, 4096] - - [968, 9486.16] + - [971, 9486.16] - - [1024, 3375, 1, 4096] - - [986, 9082.61] + - [989, 9082.61] - - [4096, 3859, 1, 1024] - - [977, 9738.77] + - [980, 9738.77] - - [4096, 3365, 1, 1024] - - [978, 9694.64] + - [981, 9694.64] - - [1024, 3162, 1, 4096] - - [979, 8550.21] + - [982, 8550.21] - - [1024, 3840, 1, 33708] - - [976, 9408.98] + - [979, 9408.98] - - [1024, 3437, 1, 4096] - - [986, 9270.39] + - [989, 9270.39] - - [4096, 3319, 1, 1024] - - [967, 9927.05] + - [970, 9927.05] - - [1024, 3320, 1, 4096] - - [986, 8962.19] + - [989, 8962.19] - - [64, 23, 2720, 23] - - [1009, 2569.43] + - [1012, 2569.43] - - [4096, 3328, 1, 1024] - - [966, 9997.31] + - [969, 9997.31] - - [1024, 3235, 1, 4096] - - [986, 8724.21] + - [989, 8724.21] - - [4096, 3282, 1, 1024] - - [967, 9827.03] + - [970, 9827.03] - - [1024, 3367, 1, 4096] - - [979, 9083.92] + - [982, 9083.92] - - [1024, 3542, 1, 4096] - - [986, 9533.0] + - [989, 9533.0] - - [64, 177, 352, 177] - - [972, 6817.81] + - [975, 6817.81] - - [4096, 3145, 1, 1024] - - [963, 9710.18] + - [966, 9710.18] - - [4096, 3514, 1, 1024] - - [966, 9792.96] + - [969, 9792.96] - - [1024, 3432, 1, 4096] - - [986, 9249.29] + - [989, 9249.29] - - [4096, 3409, 1, 1024] - - [962, 9721.5] + - [965, 9721.5] - - [1024, 4012, 1, 33708] - - [966, 9773.25] + - [969, 9773.25] - - [4096, 3876, 1, 1024] - - [963, 9745.55] + - [966, 9745.55] - - [4096, 3299, 1, 1024] - - [966, 9873.43] + - [969, 9873.43] - - [1024, 3168, 1, 4096] - - [979, 8597.03] + - [982, 8597.03] - - [4096, 3681, 1, 1024] - - [978, 9839.93] + - [981, 9839.93] - - [4096, 3531, 1, 1024] - - [967, 9847.66] + - [970, 9847.66] - - [4096, 3388, 1, 1024] - - [978, 9772.18] + - [981, 9772.18] - - [1024, 3720, 1, 4096] - - [967, 8951.5] + - [970, 8951.5] - - [1024, 3332, 1, 4096] - - [986, 8978.87] + - [989, 8978.87] - - [1024, 3273, 1, 4096] - - [980, 8982.39] + - [983, 8982.39] - - [1024, 2935, 1, 4096] - - [969, 9224.79] + - [972, 9224.79] - - [1024, 3467, 1, 4096] - - [983, 9329.23] + - [986, 9329.23] - - [4096, 3542, 1, 1024] - - [966, 9858.41] + - [969, 9858.41] - - [1024, 3130, 1, 4096] - - [965, 8526.56] + - [968, 8526.56] - - [1024, 3405, 1, 4096] - - [986, 9163.34] + - [989, 9163.34] - - [1024, 3960, 1, 1024] - - [964, 9280.26] + - [967, 9280.26] - - [4096, 3405, 1, 1024] - - [977, 9710.1] + - [980, 9710.1] - - [512, 512, 1, 1024] - - [1163, 6670.86] + - [1166, 6670.86] - - [8, 500, 1, 512] - - [1059, 228.571] + - [1062, 228.571] - - [512, 512, 1, 2000] - - [1196, 7629.34] + - [1199, 7629.34] - - [32, 512, 1, 512] - - [1056, 903.945] + - [1059, 903.945] - - [100, 1024, 1, 2048] - - [1118, 3196.88] + - [1121, 3196.88] - - [8, 512, 1, 500] - - [1049, 237.037] + - [1052, 237.037] - - [8, 500, 1, 1024] - - [1113, 289.266] + - [1116, 289.266] - - [100, 2000, 1, 1024] - - [1152, 3368.42] + - [1155, 3368.42] - - [64, 1024, 1, 100] - - [1051, 941.609] + - [1054, 941.609] - - [64, 1024, 1, 500] - - [1178, 2659.74] + - [1181, 2659.74] - - [64, 1024, 1, 1024] - - [1116, 2452.81] + - [1119, 2452.81] - - [128, 2000, 1, 100] - - [1172, 2560.0] + - [1175, 2560.0] - - [2, 500, 1, 2048] - - [1113, 72.1127] + - [1116, 72.1127] - - [16, 512, 1, 10] - - [1027, 18.2857] + - [1030, 18.2857] - - [64, 2000, 1, 1024] - - [1183, 2800.68] + - [1186, 2800.68] - - [100, 1024, 1, 1024] - - [1111, 3034.07] + - [1114, 3034.07] - - [8, 512, 1, 10] - - [1089, 9.14286] + - [1092, 9.14286] - - [16, 500, 1, 2048] - - [1113, 565.746] + - [1116, 565.746] - - [10, 100, 1, 500] - - [1049, 58.4112] + - [1052, 58.4112] - - [16, 100, 1, 10] - - [1089, 3.57143] + - [1092, 3.57143] - - [500, 1024, 1, 512] - - [1179, 6514.51] + - [1182, 6514.51] - - [128, 1024, 1, 512] - - [1197, 4194.3] + - [1200, 4194.3] - - [512, 500, 1, 2000] - - [1155, 7347.88] + - [1158, 7347.88] - - [2, 100, 1, 2000] - - [1049, 20.8333] + - [1052, 20.8333] - - [500, 512, 1, 100] - - [1171, 2539.68] + - [1174, 2539.68] - - [100, 1024, 1, 500] - - [1197, 3216.08] + - [1200, 3216.08] - - [256, 100, 1, 2048] - - [1207, 1689.07] + - [1210, 1689.07] - - [2, 512, 1, 512] - - [1063, 50.4123] + - [1066, 50.4123] - - [128, 2000, 1, 512] - - [1183, 4641.36] + - [1186, 4641.36] - - [2, 100, 1, 10] - - [1027, 0.396825] + - [1030, 0.396825] - - [16, 2000, 1, 2048] - - [1071, 1266.15] + - [1074, 1266.15] - - [200, 100, 1, 100] - - [1217, 316.456] + - [1220, 316.456] - - [256, 1024, 1, 100] - - [1173, 2685.9] + - [1176, 2685.9] - - [200, 500, 1, 1024] - - [1222, 3282.05] + - [1225, 3282.05] - - [500, 100, 1, 100] - - [1136, 631.313] + - [1139, 631.313] - - [4, 100, 1, 10] - - [1034, 0.877193] + - [1037, 0.877193] - - [32, 100, 1, 512] - - [1113, 198.835] + - [1116, 198.835] - - [100, 2000, 1, 512] - - [1183, 3832.34] + - [1186, 3832.34] - - [16, 1024, 1, 512] - - [1097, 794.376] + - [1100, 794.376] - - [200, 512, 1, 100] - - [1215, 1306.12] + - [1218, 1306.12] - - [4, 1024, 1, 1024] - - [1056, 213.125] + - [1059, 213.125] - - [512, 1024, 1, 512] - - [1180, 7049.25] + - [1183, 7049.25] - - [4, 512, 1, 10] - - [1088, 4.49123] + - [1091, 4.49123] - - [2, 2048, 1, 2000] - - [1049, 300.293] + - [1052, 300.293] - - [64, 2048, 1, 10] - - [1209, 240.941] + - [1212, 240.941] - - [128, 100, 1, 10] - - [1214, 27.5862] + - [1217, 27.5862] - - [4, 512, 1, 2048] - - [1049, 146.449] + - [1052, 146.449] - - [64, 2048, 1, 500] - - [1189, 4015.69] + - [1192, 4015.69] - - [512, 512, 1, 512] - - [1144, 6123.07] + - [1147, 6123.07] - - [500, 500, 1, 2000] - - [1155, 7126.57] + - [1158, 7126.57] - - [10, 1024, 1, 2000] - - [1122, 807.571] + - [1125, 807.571] - - [256, 100, 1, 100] - - [1134, 296.296] + - [1137, 296.296] - - [32, 2000, 1, 2048] - - [1077, 2167.2] + - [1080, 2167.2] - - [64, 1024, 1, 2048] - - [1110, 2383.13] + - [1113, 2383.13] - - [200, 2048, 1, 512] - - [1185, 5263.94] + - [1188, 5263.94] - - [256, 500, 1, 10] - - [1167, 210.526] + - [1170, 210.526] - - [16, 1024, 1, 100] - - [1047, 262.564] + - [1050, 262.564] - - [32, 1024, 1, 1024] - - [1052, 1476.87] + - [1055, 1476.87] - - [512, 500, 1, 512] - - [1141, 5851.43] + - [1144, 5851.43] - - [128, 1024, 1, 2000] - - [1225, 5516.5] + - [1228, 5516.5] - - [8, 100, 1, 500] - - [1049, 46.2963] + - [1052, 46.2963] - - [100, 2000, 1, 2048] - - [1204, 3715.53] + - [1207, 3715.53] - - [10, 512, 1, 512] - - [1059, 292.571] + - [1062, 292.571] - - [8, 500, 1, 10] - - [1088, 8.77193] + - [1091, 8.77193] - - [10, 2000, 1, 1024] - - [1102, 640.0] + - [1105, 640.0] - - [16, 1024, 1, 10] - - [1087, 36.5714] + - [1090, 36.5714] - - [16, 512, 1, 2048] - - [1066, 585.797] + - [1069, 585.797] - - [256, 512, 1, 10] - - [1132, 230.761] + - [1135, 230.761] - - [2, 2000, 1, 100] - - [1094, 64.1026] + - [1097, 64.1026] - - [128, 512, 1, 2048] - - [1061, 3106.89] + - [1064, 3106.89] - - [128, 512, 1, 100] - - [1054, 952.558] + - [1057, 952.558] - - [512, 2000, 1, 1024] - - [1151, 8065.97] + - [1154, 8065.97] - - [64, 500, 1, 2048] - - [1220, 1857.6] + - [1223, 1857.6] - - [64, 2000, 1, 2048] - - [1202, 3442.02] + - [1205, 3442.02] - - [64, 2048, 1, 512] - - [1203, 3315.66] + - [1206, 3315.66] - - [10, 2000, 1, 512] - - [1049, 785.276] + - [1052, 785.276] - - [32, 2000, 1, 500] - - [1052, 2500.0] + - [1055, 2500.0] - - [64, 2000, 1, 10] - - [1040, 231.884] + - [1043, 231.884] - - [500, 100, 1, 10] - - [1137, 88.0282] + - [1140, 88.0282] - - [128, 1024, 1, 500] - - [1188, 4096.0] + - [1191, 4096.0] - - [64, 100, 1, 2048] - - [1049, 587.24] + - [1052, 587.24] - - [64, 100, 1, 10] - - [1208, 11.9403] + - [1211, 11.9403] - - [16, 512, 1, 500] - - [1059, 461.261] + - [1062, 461.261] - - [32, 2000, 1, 1024] - - [1046, 1713.81] + - [1049, 1713.81] - - [200, 512, 1, 1024] - - [1225, 3244.36] + - [1228, 3244.36] - - [128, 2048, 1, 10] - - [1041, 455.111] + - [1044, 455.111] - - [200, 100, 1, 2000] - - [1049, 1461.99] + - [1052, 1461.99] - - [2, 100, 1, 512] - - [1049, 12.4272] + - [1052, 12.4272] - - [64, 2048, 1, 100] - - [1215, 1689.07] + - [1218, 1689.07] - - [32, 512, 1, 100] - - [1048, 265.974] + - [1051, 265.974] - - [16, 512, 1, 1024] - - [1113, 569.878] + - [1116, 569.878] - - [4, 1024, 1, 512] - - [1103, 208.051] + - [1106, 208.051] - - [64, 2000, 1, 100] - - [1215, 1649.48] + - [1218, 1649.48] - - [512, 2048, 1, 512] - - [1151, 7848.99] + - [1154, 7848.99] - - [2, 500, 1, 500] - - [1037, 53.4188] + - [1040, 53.4188] - - [32, 100, 1, 100] - - [1048, 57.1429] + - [1051, 57.1429] - - [100, 500, 1, 2000] - - [1052, 2783.96] + - [1055, 2783.96] - - [200, 2000, 1, 100] - - [1124, 2994.01] + - [1127, 2994.01] - - [10, 512, 1, 10] - - [1084, 11.0345] + - [1087, 11.0345] - - [100, 500, 1, 2048] - - [1224, 2361.62] + - [1227, 2361.62] - - [4, 2048, 1, 500] - - [1059, 379.259] + - [1062, 379.259] - - [200, 500, 1, 100] - - [1185, 1288.66] + - [1188, 1288.66] - - [500, 500, 1, 500] - - [1141, 5425.35] + - [1144, 5425.35] - - [2, 100, 1, 1024] - - [1113, 16.2025] + - [1116, 16.2025] - - [128, 2048, 1, 512] - - [1199, 4699.5] + - [1202, 4699.5] - - [200, 2000, 1, 1024] - - [1149, 4620.94] + - [1152, 4620.94] - - [32, 512, 1, 1024] - - [1112, 1028.02] + - [1115, 1028.02] - - [100, 2048, 1, 500] - - [1173, 4142.39] + - [1176, 4142.39] - - [256, 100, 1, 1024] - - [1203, 1443.52] + - [1206, 1443.52] - - [16, 2000, 1, 500] - - [1098, 1428.57] + - [1101, 1428.57] - - [128, 100, 1, 100] - - [1048, 213.333] + - [1051, 213.333] - - [500, 500, 1, 2048] - - [1145, 6639.0] + - [1148, 6639.0] - - [32, 512, 1, 10] - - [1081, 35.9298] + - [1084, 35.9298] - - [128, 100, 1, 1024] - - [1109, 791.498] + - [1112, 791.498] - - [16, 500, 1, 2000] - - [1122, 694.444] + - [1125, 694.444] - - [4, 2048, 1, 100] - - [1093, 129.62] + - [1096, 129.62] - - [64, 500, 1, 500] - - [1035, 1333.33] + - [1038, 1333.33] - - [500, 1024, 1, 2048] - - [1154, 7031.76] + - [1157, 7031.76] - - [512, 2048, 1, 100] - - [1129, 5285.16] + - [1132, 5285.16] - - [128, 512, 1, 1024] - - [1221, 2519.1] + - [1224, 2519.1] - - [128, 512, 1, 2000] - - [1219, 3608.81] + - [1222, 3608.81] - - [128, 2000, 1, 2000] - - [1192, 7017.54] + - [1195, 7017.54] - - [2, 512, 1, 10] - - [1085, 2.03175] + - [1088, 2.03175] - - [10, 512, 1, 500] - - [1049, 293.578] + - [1052, 293.578] - - [4, 1024, 1, 2000] - - [1069, 326.115] + - [1072, 326.115] - - [256, 100, 1, 2000] - - [1206, 1767.96] + - [1209, 1767.96] - - [512, 2048, 1, 2000] - - [1151, 8674.52] + - [1154, 8674.52] - - [100, 100, 1, 10] - - [1213, 21.5517] + - [1216, 21.5517] - - [256, 500, 1, 1024] - - [1153, 4833.04] + - [1156, 4833.04] - - [128, 512, 1, 10] - - [1041, 132.129] + - [1044, 132.129] - - [256, 100, 1, 500] - - [1200, 914.286] + - [1203, 914.286] - - [64, 100, 1, 512] - - [1107, 369.009] + - [1110, 369.009] - - [64, 512, 1, 500] - - [1049, 1600.0] + - [1052, 1600.0] - - [64, 2048, 1, 2000] - - [1203, 5925.5] + - [1206, 5925.5] - - [100, 2048, 1, 1024] - - [1161, 3260.5] + - [1164, 3260.5] - - [200, 2000, 1, 10] - - [1041, 595.238] + - [1044, 595.238] - - [128, 1024, 1, 100] - - [1185, 1689.07] + - [1188, 1689.07] - - [16, 2000, 1, 100] - - [1048, 493.827] + - [1051, 493.827] - - [8, 100, 1, 512] - - [1049, 49.7087] + - [1052, 49.7087] - - [500, 2048, 1, 1024] - - [1151, 7651.61] + - [1154, 7651.61] - - [500, 2000, 1, 10] - - [1139, 1008.06] + - [1142, 1008.06] - - [32, 100, 1, 500] - - [1113, 186.916] + - [1116, 186.916] - - [256, 1024, 1, 2048] - - [1154, 6190.85] + - [1157, 6190.85] - - [32, 500, 1, 2048] - - [1049, 1083.6] + - [1052, 1083.6] - - [4, 2000, 1, 10] - - [1092, 17.5439] + - [1095, 17.5439] - - [128, 500, 1, 2000] - - [1109, 3516.48] + - [1112, 3516.48] - - [8, 1024, 1, 10] - - [1083, 17.9649] + - [1086, 17.9649] - - [2, 500, 1, 100] - - [1028, 16.0256] + - [1031, 16.0256] - - [10, 500, 1, 512] - - [1049, 290.909] + - [1052, 290.909] - - [10, 2000, 1, 10] - - [1027, 38.4615] + - [1030, 38.4615] - - [500, 512, 1, 512] - - [1144, 5893.53] + - [1147, 5893.53] - - [32, 500, 1, 500] - - [1049, 892.857] + - [1052, 892.857] - - [256, 500, 1, 2000] - - [1158, 6237.82] + - [1161, 6237.82] - - [100, 500, 1, 100] - - [1060, 726.744] + - [1063, 726.744] - - [500, 2048, 1, 100] - - [1133, 4866.92] + - [1136, 4866.92] - - [10, 1024, 1, 512] - - [1049, 520.127] + - [1052, 520.127] - - [2, 2048, 1, 512] - - [1059, 151.528] + - [1062, 151.528] - - [256, 512, 1, 100] - - [1138, 1590.68] + - [1141, 1590.68] - - [10, 2048, 1, 100] - - [1049, 324.051] + - [1052, 324.051] - - [8, 2048, 1, 100] - - [1104, 256.0] + - [1107, 256.0] - - [512, 100, 1, 512] - - [1200, 2100.51] + - [1203, 2100.51] - - [4, 500, 1, 500] - - [1049, 115.741] + - [1052, 115.741] - - [64, 100, 1, 1024] - - [1049, 450.11] + - [1052, 450.11] - - [2, 2048, 1, 1024] - - [1106, 137.608] + - [1109, 137.608] - - [2, 500, 1, 2000] - - [1075, 90.2527] + - [1078, 90.2527] - - [512, 1024, 1, 500] - - [1180, 6898.53] + - [1183, 6898.53] - - [128, 2000, 1, 500] - - [1185, 5161.29] + - [1188, 5161.29] - - [32, 512, 1, 2048] - - [1119, 1103.76] + - [1122, 1103.76] - - [10, 100, 1, 2000] - - [1049, 105.932] + - [1052, 105.932] - - [4, 100, 1, 512] - - [1049, 24.6154] + - [1052, 24.6154] - - [2, 512, 1, 2048] - - [1113, 73.2246] + - [1116, 73.2246] - - [200, 512, 1, 2048] - - [1225, 3953.91] + - [1228, 3953.91] - - [200, 2000, 1, 2000] - - [1187, 6230.53] + - [1190, 6230.53] - - [100, 100, 1, 2000] - - [1049, 827.815] + - [1052, 827.815] - - [500, 2048, 1, 2000] - - [1150, 8387.94] + - [1153, 8387.94] - - [64, 2048, 1, 2048] - - [1195, 3406.54] + - [1198, 3406.54] - - [16, 2000, 1, 1024] - - [1055, 1024.0] + - [1058, 1024.0] - - [512, 2048, 1, 1024] - - [1128, 8061.12] + - [1131, 8061.12] - - [10, 500, 1, 500] - - [1059, 284.091] + - [1062, 284.091] - - [200, 1024, 1, 2048] - - [1223, 4886.19] + - [1226, 4886.19] - - [10, 2000, 1, 2000] - - [1049, 1449.28] + - [1052, 1449.28] - - [8, 2000, 1, 500] - - [1098, 719.424] + - [1101, 719.424] - - [2, 100, 1, 2048] - - [1113, 19.845] + - [1116, 19.845] - - [32, 100, 1, 2048] - - [1113, 323.794] + - [1116, 323.794] - - [512, 512, 1, 10] - - [1170, 420.103] + - [1173, 420.103] - - [512, 500, 1, 10] - - [1175, 376.471] + - [1178, 376.471] - - [16, 100, 1, 1024] - - [1059, 129.62] + - [1062, 129.62] - - [2, 500, 1, 10] - - [1023, 2.11864] + - [1026, 2.11864] - - [200, 512, 1, 10] - - [1025, 188.235] + - [1028, 188.235] - - [512, 1024, 1, 100] - - [1125, 3877.87] + - [1128, 3877.87] - - [16, 2000, 1, 2000] - - [1049, 2222.22] + - [1052, 2222.22] - - [500, 500, 1, 1024] - - [1145, 6130.27] + - [1148, 6130.27] - - [500, 100, 1, 2048] - - [1200, 2949.31] + - [1203, 2949.31] - - [256, 1024, 1, 512] - - [1164, 5886.74] + - [1167, 5886.74] - - [256, 500, 1, 512] - - [1142, 4380.75] + - [1145, 4380.75] - - [16, 1024, 1, 2000] - - [1113, 1208.26] + - [1116, 1208.26] - - [200, 500, 1, 2048] - - [1225, 3855.42] + - [1228, 3855.42] - - [256, 2000, 1, 10] - - [1127, 727.273] + - [1130, 727.273] - - [10, 2048, 1, 2048] - - [1080, 823.058] + - [1083, 823.058] - - [512, 2000, 1, 100] - - [1129, 5120.0] + - [1132, 5120.0] - - [10, 1024, 1, 1024] - - [1056, 553.046] + - [1059, 553.046] - - [512, 2000, 1, 2048] - - [1157, 7563.3] + - [1160, 7563.3] - - [500, 1024, 1, 500] - - [1181, 6570.84] + - [1184, 6570.84] - - [500, 100, 1, 512] - - [1200, 2038.22] + - [1203, 2038.22] - - [256, 2000, 1, 100] - - [1149, 3764.71] + - [1152, 3764.71] - - [512, 1024, 1, 2048] - - [1193, 7286.52] + - [1196, 7286.52] - - [32, 512, 1, 500] - - [1049, 898.246] + - [1052, 898.246] - - [100, 2000, 1, 10] - - [1041, 333.333] + - [1044, 333.333] - - [100, 500, 1, 512] - - [1219, 2176.87] + - [1222, 2176.87] - - [8, 2000, 1, 512] - - [1098, 602.353] + - [1101, 602.353] - - [100, 2048, 1, 2048] - - [1205, 3694.77] + - [1208, 3694.77] - - [128, 1024, 1, 2048] - - [1224, 4168.25] + - [1227, 4168.25] - - [8, 500, 1, 2000] - - [1123, 352.113] + - [1126, 352.113] - - [100, 2000, 1, 500] - - [1173, 4045.31] + - [1176, 4045.31] - - [100, 2048, 1, 100] - - [1173, 2081.3] + - [1176, 2081.3] - - [4, 100, 1, 1024] - - [1049, 33.0323] + - [1052, 33.0323] - - [500, 2048, 1, 2048] - - [1157, 7764.93] + - [1160, 7764.93] - - [2, 2000, 1, 2048] - - [1068, 166.234] + - [1071, 166.234] - - [200, 2048, 1, 10] - - [1042, 609.524] + - [1045, 609.524] - - [2, 500, 1, 1024] - - [1113, 75.2941] + - [1116, 75.2941] - - [100, 500, 1, 1024] - - [1109, 1975.31] + - [1112, 1975.31] - - [16, 2048, 1, 500] - - [1049, 1473.38] + - [1052, 1473.38] - - [100, 1024, 1, 10] - - [1209, 185.507] + - [1212, 185.507] - - [8, 2048, 1, 1024] - - [1105, 543.304] + - [1108, 543.304] - - [2, 2000, 1, 500] - - [1049, 179.856] + - [1052, 179.856] - - [32, 100, 1, 1024] - - [1049, 267.712] + - [1052, 267.712] - - [500, 2000, 1, 512] - - [1179, 7087.49] + - [1182, 7087.49] - - [64, 100, 1, 2000] - - [1059, 615.385] + - [1062, 615.385] - - [100, 1024, 1, 2000] - - [1222, 4224.42] + - [1225, 4224.42] - - [64, 500, 1, 10] - - [1024, 63.4921] + - [1027, 63.4921] - - [32, 2048, 1, 100] - - [1045, 941.609] + - [1048, 941.609] - - [64, 500, 1, 512] - - [1049, 1575.38] + - [1052, 1575.38] - - [10, 100, 1, 1024] - - [1059, 82.5806] + - [1062, 82.5806] - - [16, 512, 1, 100] - - [1048, 148.406] + - [1051, 148.406] - - [4, 100, 1, 2000] - - [1122, 43.8597] + - [1125, 43.8597] - - [2, 512, 1, 1024] - - [1113, 74.052] + - [1116, 74.052] - - [64, 512, 1, 1024] - - [1114, 1570.9] + - [1117, 1570.9] - - [10, 2048, 1, 500] - - [1049, 920.863] + - [1052, 920.863] - - [4, 2000, 1, 2048] - - [1068, 326.115] + - [1071, 326.115] - - [512, 100, 1, 2048] - - [1203, 3084.05] + - [1206, 3084.05] - - [32, 100, 1, 2000] - - [1049, 343.348] + - [1052, 343.348] - - [256, 512, 1, 500] - - [1142, 4311.58] + - [1145, 4311.58] - - [100, 2000, 1, 100] - - [1173, 2016.13] + - [1176, 2016.13] - - [8, 2000, 1, 1024] - - [1062, 544.681] + - [1065, 544.681] - - [4, 512, 1, 500] - - [1049, 118.519] + - [1052, 118.519] - - [128, 1024, 1, 10] - - [1212, 244.537] + - [1215, 244.537] - - [4, 500, 1, 1024] - - [1049, 144.633] + - [1052, 144.633] - - [32, 2048, 1, 512] - - [1052, 2139.95] + - [1055, 2139.95] - - [32, 100, 1, 10] - - [1027, 7.01754] + - [1030, 7.01754] - - [100, 2048, 1, 10] - - [1216, 341.333] + - [1219, 341.333] - - [512, 500, 1, 100] - - [1177, 2461.54] + - [1180, 2461.54] - - [128, 2000, 1, 1024] - - [1161, 4174.27] + - [1164, 4174.27] - - [200, 1024, 1, 500] - - [1173, 4295.3] + - [1176, 4295.3] - - [32, 2048, 1, 1024] - - [1076, 1667.72] + - [1079, 1667.72] - - [10, 1024, 1, 2048] - - [1067, 555.39] + - [1070, 555.39] - - [8, 500, 1, 100] - - [1048, 71.4286] + - [1051, 71.4286] - - [32, 2048, 1, 500] - - [1052, 2528.4] + - [1055, 2528.4] - - [200, 100, 1, 1024] - - [1061, 1071.13] + - [1064, 1071.13] - - [16, 100, 1, 100] - - [1038, 28.5714] + - [1041, 28.5714] - - [8, 1024, 1, 2000] - - [1122, 654.313] + - [1125, 654.313] - - [4, 512, 1, 100] - - [1048, 36.5714] + - [1051, 36.5714] - - [16, 500, 1, 100] - - [1048, 142.857] + - [1051, 142.857] - - [8, 1024, 1, 2048] - - [1074, 441.506] + - [1077, 441.506] - - [16, 1024, 1, 2048] - - [1075, 886.745] + - [1078, 886.745] - - [10, 2048, 1, 1024] - - [1053, 639.376] + - [1056, 639.376] - - [64, 512, 1, 100] - - [1048, 518.481] + - [1051, 518.481] - - [2, 100, 1, 500] - - [1049, 9.61538] + - [1052, 9.61538] - - [2, 500, 1, 512] - - [1055, 48.1203] + - [1058, 48.1203] - - [256, 512, 1, 2000] - - [1158, 6450.39] + - [1161, 6450.39] - - [128, 500, 1, 1024] - - [1052, 2497.56] + - [1055, 2497.56] - - [10, 100, 1, 10] - - [1089, 2.23214] + - [1092, 2.23214] - - [8, 2048, 1, 2048] - - [1039, 643.298] + - [1042, 643.298] - - [16, 2048, 1, 2048] - - [1079, 1337.9] + - [1082, 1337.9] - - [64, 1024, 1, 10] - - [1042, 132.129] + - [1045, 132.129] - - [500, 100, 1, 500] - - [1200, 1940.99] + - [1203, 1940.99] - - [256, 1024, 1, 2000] - - [1196, 7629.34] + - [1199, 7629.34] - - [200, 512, 1, 500] - - [1185, 3232.32] + - [1188, 3232.32] - - [8, 2000, 1, 10] - - [1086, 32.2581] + - [1089, 32.2581] - - [64, 2000, 1, 512] - - [1184, 3225.2] + - [1187, 3225.2] - - [2, 512, 1, 100] - - [1028, 16.6234] + - [1031, 16.6234] - - [4, 2000, 1, 2000] - - [1049, 586.51] + - [1052, 586.51] - - [200, 1024, 1, 100] - - [1173, 2133.33] + - [1176, 2133.33] - - [16, 100, 1, 500] - - [1113, 92.5926] + - [1116, 92.5926] - - [128, 100, 1, 500] - - [1109, 526.316] + - [1112, 526.316] - - [500, 1024, 1, 1024] - - [1143, 7201.76] + - [1146, 7201.76] - - [200, 1024, 1, 1024] - - [1195, 4519.72] + - [1198, 4519.72] - - [8, 2048, 1, 512] - - [1059, 624.152] + - [1062, 624.152] - - [200, 2000, 1, 500] - - [1149, 5186.72] + - [1152, 5186.72] - - [512, 100, 1, 1024] - - [1200, 2742.09] + - [1203, 2742.09] - - [16, 100, 1, 2000] - - [1059, 168.776] + - [1062, 168.776] - - [500, 512, 1, 2000] - - [1196, 7289.29] + - [1199, 7289.29] - - [8, 2000, 1, 2048] - - [1070, 668.189] + - [1073, 668.189] - - [256, 2048, 1, 100] - - [1131, 3924.31] + - [1134, 3924.31] - - [32, 2048, 1, 2000] - - [1063, 3882.46] + - [1066, 3882.46] - - [200, 500, 1, 512] - - [1188, 3368.42] + - [1191, 3368.42] - - [10, 512, 1, 100] - - [1048, 91.4286] + - [1051, 91.4286] - - [16, 2000, 1, 10] - - [1026, 61.5385] + - [1029, 61.5385] - - [8, 512, 1, 100] - - [1048, 72.1127] + - [1051, 72.1127] - - [256, 512, 1, 512] - - [1153, 4583.94] + - [1156, 4583.94] - - [500, 2000, 1, 1024] - - [1128, 7569.49] + - [1131, 7569.49] - - [512, 512, 1, 500] - - [1144, 5708.71] + - [1147, 5708.71] - - [256, 2048, 1, 1024] - - [1168, 5923.11] + - [1171, 5923.11] - - [8, 2048, 1, 2000] - - [1049, 1153.8] + - [1052, 1153.8] - - [100, 512, 1, 2048] - - [1115, 2383.13] + - [1118, 2383.13] - - [100, 1024, 1, 512] - - [1200, 3343.67] + - [1203, 3343.67] - - [128, 100, 1, 2000] - - [1218, 1084.75] + - [1221, 1084.75] - - [4, 2048, 1, 2048] - - [1067, 332.354] + - [1070, 332.354] - - [2, 1024, 1, 2000] - - [1078, 161.006] + - [1081, 161.006] - - [100, 512, 1, 512] - - [1052, 2184.53] + - [1055, 2184.53] - - [128, 1024, 1, 1024] - - [1195, 3847.99] + - [1198, 3847.99] - - [200, 2048, 1, 1024] - - [1130, 4547.16] + - [1133, 4547.16] - - [32, 1024, 1, 2000] - - [1059, 2416.52] + - [1062, 2416.52] - - [128, 500, 1, 100] - - [1054, 919.54] + - [1057, 919.54] - - [200, 512, 1, 2000] - - [1222, 4238.41] + - [1225, 4238.41] - - [10, 2048, 1, 2000] - - [1059, 1454.55] + - [1062, 1454.55] - - [256, 1024, 1, 500] - - [1156, 5669.2] + - [1159, 5669.2] - - [100, 100, 1, 100] - - [1048, 171.233] + - [1051, 171.233] - - [8, 512, 1, 1024] - - [1117, 286.496] + - [1120, 286.496] - - [200, 1024, 1, 512] - - [1173, 4354.55] + - [1176, 4354.55] - - [256, 500, 1, 500] - - [1158, 4020.1] + - [1161, 4020.1] - - [200, 100, 1, 500] - - [1222, 702.247] + - [1225, 702.247] - - [2, 1024, 1, 2048] - - [1068, 112.75] + - [1071, 112.75] - - [256, 500, 1, 2048] - - [1158, 5041.23] + - [1161, 5041.23] - - [512, 2048, 1, 500] - - [1151, 7710.12] + - [1154, 7710.12] - - [512, 100, 1, 2000] - - [1200, 3099.27] + - [1203, 3099.27] - - [512, 500, 1, 1024] - - [1159, 6463.12] + - [1162, 6463.12] - - [16, 512, 1, 2000] - - [1075, 721.127] + - [1078, 721.127] - - [64, 500, 1, 1024] - - [1114, 1528.36] + - [1117, 1528.36] - - [512, 2000, 1, 10] - - [1135, 1174.31] + - [1138, 1174.31] - - [256, 512, 1, 1024] - - [1153, 4978.4] + - [1156, 4978.4] - - [10, 512, 1, 1024] - - [1113, 370.26] + - [1116, 370.26] - - [512, 100, 1, 100] - - [1136, 659.794] + - [1139, 659.794] - - [8, 2000, 1, 100] - - [1048, 256.41] + - [1051, 256.41] - - [128, 2048, 1, 1024] - - [1161, 4173.44] + - [1164, 4173.44] - - [2, 2000, 1, 2000] - - [1049, 250.627] + - [1052, 250.627] - - [16, 2048, 1, 1024] - - [1096, 1045.96] + - [1099, 1045.96] - - [500, 512, 1, 500] - - [1141, 5517.24] + - [1144, 5517.24] - - [8, 100, 1, 1024] - - [1114, 64.0] + - [1117, 64.0] - - [10, 100, 1, 100] - - [1038, 17.8571] + - [1041, 17.8571] - - [200, 500, 1, 500] - - [1188, 3140.7] + - [1191, 3140.7] - - [10, 500, 1, 2000] - - [1075, 444.84] + - [1078, 444.84] - - [500, 100, 1, 2000] - - [1203, 2969.12] + - [1206, 2969.12] - - [100, 512, 1, 2000] - - [1115, 2776.57] + - [1118, 2776.57] - - [500, 1024, 1, 2000] - - [1194, 8020.05] + - [1197, 8020.05] - - [32, 2000, 1, 2000] - - [1055, 3827.75] + - [1058, 3827.75] - - [64, 1024, 1, 512] - - [1219, 2573.19] + - [1222, 2573.19] - - [64, 2000, 1, 2000] - - [1188, 5797.1] + - [1191, 5797.1] - - [32, 500, 1, 100] - - [1048, 266.667] + - [1051, 266.667] - - [128, 2000, 1, 2048] - - [1204, 4547.95] + - [1207, 4547.95] - - [10, 100, 1, 2048] - - [1113, 98.4615] + - [1116, 98.4615] - - [32, 2048, 1, 2048] - - [1076, 2213.35] + - [1079, 2213.35] - - [64, 100, 1, 100] - - [1049, 96.3855] + - [1052, 96.3855] - - [2, 1024, 1, 100] - - [1099, 34.5946] + - [1102, 34.5946] - - [256, 1024, 1, 10] - - [1169, 425.558] + - [1172, 425.558] - - [256, 1024, 1, 1024] - - [1162, 5482.75] + - [1165, 5482.75] - - [64, 500, 1, 2000] - - [1049, 2056.56] + - [1052, 2056.56] - - [512, 2000, 1, 512] - - [1147, 7550.23] + - [1150, 7550.23] - - [8, 512, 1, 512] - - [1056, 231.986] + - [1059, 231.986] - - [8, 512, 1, 2048] - - [1049, 290.464] + - [1052, 290.464] - - [100, 100, 1, 1024] - - [1219, 624.39] + - [1222, 624.39] - - [2, 2048, 1, 10] - - [1092, 8.82759] + - [1095, 8.82759] - - [4, 2048, 1, 512] - - [1098, 312.076] + - [1101, 312.076] - - [4, 2048, 1, 10] - - [1091, 17.9649] + - [1094, 17.9649] - - [8, 100, 1, 2000] - - [1068, 85.8369] + - [1071, 85.8369] - - [2, 1024, 1, 1024] - - [1065, 101.214] + - [1068, 101.214] - - [16, 2048, 1, 100] - - [1049, 518.481] + - [1052, 518.481] - - [16, 512, 1, 512] - - [1059, 455.903] + - [1062, 455.903] - - [32, 500, 1, 512] - - [1056, 906.195] + - [1059, 906.195] - - [500, 2000, 1, 2000] - - [1151, 8143.32] + - [1154, 8143.32] - - [500, 1024, 1, 10] - - [1132, 680.851] + - [1135, 680.851] - - [32, 500, 1, 1024] - - [1108, 1008.87] + - [1111, 1008.87] - - [32, 500, 1, 10] - - [1044, 33.3333] + - [1047, 33.3333] - - [500, 500, 1, 10] - - [1173, 367.647] + - [1176, 367.647] - - [4, 2000, 1, 500] - - [1059, 370.37] + - [1062, 370.37] - - [10, 2000, 1, 500] - - [1049, 899.281] + - [1052, 899.281] - - [32, 2000, 1, 512] - - [1061, 2089.8] + - [1064, 2089.8] - - [256, 500, 1, 100] - - [1174, 1495.33] + - [1177, 1495.33] - - [256, 2048, 1, 10] - - [1132, 789.59] + - [1135, 789.59] - - [4, 1024, 1, 500] - - [1049, 222.609] + - [1052, 222.609] - - [256, 512, 1, 2048] - - [1158, 5292.5] + - [1161, 5292.5] - - [2, 2000, 1, 1024] - - [1096, 137.265] + - [1099, 137.265] - - [256, 100, 1, 512] - - [1200, 1085.03] + - [1203, 1085.03] - - [8, 1024, 1, 500] - - [1049, 441.379] + - [1052, 441.379] - - [256, 2048, 1, 500] - - [1179, 7031.76] + - [1182, 7031.76] - - [256, 2048, 1, 2048] - - [1142, 6771.83] + - [1145, 6771.83] - - [2, 2000, 1, 512] - - [1103, 159.006] + - [1106, 159.006] - - [256, 2000, 1, 512] - - [1146, 6527.49] + - [1149, 6527.49] - - [4, 1024, 1, 100] - - [1095, 70.137] + - [1098, 70.137] - - [512, 1024, 1, 2000] - - [1180, 8295.7] + - [1183, 8295.7] - - [100, 500, 1, 500] - - [1052, 2016.13] + - [1055, 2016.13] - - [4, 2048, 1, 1024] - - [1100, 284.939] + - [1103, 284.939] - - [2, 1024, 1, 500] - - [1049, 109.402] + - [1052, 109.402] - - [64, 100, 1, 500] - - [1049, 296.296] + - [1052, 296.296] - - [256, 2000, 1, 2000] - - [1157, 8152.87] + - [1160, 8152.87] - - [2, 512, 1, 500] - - [1055, 44.7552] + - [1058, 44.7552] - - [8, 2048, 1, 500] - - [1049, 736.691] + - [1052, 736.691] - - [10, 1024, 1, 500] - - [1049, 547.009] + - [1052, 547.009] - - [4, 2048, 1, 2000] - - [1059, 604.13] + - [1062, 604.13] - - [200, 1024, 1, 2000] - - [1226, 5400.84] + - [1229, 5400.84] - - [128, 500, 1, 512] - - [1219, 2730.67] + - [1222, 2730.67] - - [10, 500, 1, 2048] - - [1113, 359.551] + - [1116, 359.551] - - [256, 2048, 1, 2000] - - [1157, 8375.21] + - [1160, 8375.21] - - [8, 2000, 1, 2000] - - [1059, 1146.13] + - [1062, 1146.13] - - [100, 2048, 1, 512] - - [1182, 3936.1] + - [1185, 3936.1] - - [512, 500, 1, 2048] - - [1158, 6756.29] + - [1161, 6756.29] - - [200, 2048, 1, 100] - - [1149, 3180.12] + - [1152, 3180.12] - - [128, 512, 1, 512] - - [1052, 2872.81] + - [1055, 2872.81] - - [200, 2000, 1, 2048] - - [1198, 4818.82] + - [1201, 4818.82] - - [4, 2000, 1, 1024] - - [1096, 275.269] + - [1099, 275.269] - - [64, 512, 1, 10] - - [1211, 69.4237] + - [1214, 69.4237] - - [32, 500, 1, 2000] - - [1078, 1246.11] + - [1081, 1246.11] - - [128, 2048, 1, 2000] - - [1191, 7233.55] + - [1194, 7233.55] - - [100, 100, 1, 2048] - - [1049, 790.123] + - [1052, 790.123] - - [500, 2048, 1, 512] - - [1179, 7249.56] + - [1182, 7249.56] - - [200, 100, 1, 512] - - [1055, 748.538] + - [1058, 748.538] - - [32, 2000, 1, 100] - - [1050, 930.233] + - [1053, 930.233] - - [500, 512, 1, 2048] - - [1201, 6639.92] + - [1204, 6639.92] - - [500, 2000, 1, 500] - - [1181, 7078.14] + - [1184, 7078.14] - - [200, 100, 1, 2048] - - [1059, 1387.53] + - [1062, 1387.53] - - [2, 2048, 1, 100] - - [1093, 64.8101] + - [1096, 64.8101] - - [8, 100, 1, 10] - - [1034, 1.75439] + - [1037, 1.75439] - - [200, 2048, 1, 2048] - - [1198, 5021.92] + - [1201, 5021.92] - - [200, 2048, 1, 500] - - [1149, 5355.65] + - [1152, 5355.65] - - [100, 100, 1, 500] - - [1219, 416.667] + - [1222, 416.667] - - [8, 2048, 1, 10] - - [1090, 34.7119] + - [1093, 34.7119] - - [100, 500, 1, 10] - - [1030, 93.2836] + - [1033, 93.2836] - - [200, 500, 1, 2000] - - [1222, 4152.82] + - [1225, 4152.82] - - [512, 2000, 1, 500] - - [1151, 7485.38] + - [1154, 7485.38] - - [10, 500, 1, 1024] - - [1117, 363.636] + - [1120, 363.636] - - [256, 100, 1, 10] - - [1166, 41.0256] + - [1169, 41.0256] - - [500, 512, 1, 1024] - - [1145, 6362.72] + - [1148, 6362.72] - - [200, 2048, 1, 2000] - - [1187, 6320.99] + - [1190, 6320.99] - - [100, 1024, 1, 100] - - [1186, 1306.12] + - [1189, 1306.12] - - [500, 1024, 1, 100] - - [1125, 3699.42] + - [1128, 3699.42] - - [10, 512, 1, 2048] - - [1049, 361.08] + - [1052, 361.08] - - [2, 1024, 1, 512] - - [1098, 105.703] + - [1101, 105.703] - - [4, 500, 1, 2048] - - [1121, 143.417] + - [1124, 143.417] - - [100, 512, 1, 100] - - [1054, 744.186] + - [1057, 744.186] - - [16, 500, 1, 512] - - [1049, 453.097] + - [1052, 453.097] - - [10, 1024, 1, 100] - - [1047, 166.234] + - [1050, 166.234] - - [8, 1024, 1, 100] - - [1095, 140.274] + - [1098, 140.274] - - [64, 2000, 1, 500] - - [1190, 3940.89] + - [1193, 3940.89] - - [64, 1024, 1, 2000] - - [1055, 3531.03] + - [1058, 3531.03] - - [10, 100, 1, 512] - - [1049, 61.5385] + - [1052, 61.5385] - - [4, 500, 1, 2000] - - [1075, 173.01] + - [1078, 173.01] - - [512, 1024, 1, 10] - - [1126, 736.36] + - [1129, 736.36] - - [128, 2048, 1, 2048] - - [1189, 4596.5] + - [1192, 4596.5] - - [4, 100, 1, 100] - - [1038, 7.14286] + - [1041, 7.14286] - - [32, 1024, 1, 512] - - [1098, 1519.68] + - [1101, 1519.68] - - [8, 512, 1, 2000] - - [1123, 356.794] + - [1126, 356.794] - - [100, 100, 1, 512] - - [1063, 426.667] + - [1066, 426.667] - - [2, 2048, 1, 2048] - - [1072, 170.778] + - [1075, 170.778] - - [2, 512, 1, 2000] - - [1075, 90.7801] + - [1078, 90.7801] - - [16, 500, 1, 10] - - [1048, 18.1818] + - [1051, 18.1818] - - [10, 500, 1, 100] - - [1048, 88.0282] + - [1051, 88.0282] - - [4, 100, 1, 500] - - [1113, 23.5849] + - [1116, 23.5849] - - [512, 1024, 1, 1024] - - [1165, 7431.77] + - [1168, 7431.77] - - [64, 500, 1, 100] - - [1058, 506.329] + - [1061, 506.329] - - [128, 2000, 1, 10] - - [1216, 432.432] + - [1219, 432.432] - - [10, 2000, 1, 2048] - - [1079, 806.299] + - [1082, 806.299] - - [2, 100, 1, 100] - - [1036, 3.125] + - [1039, 3.125] - - [10, 512, 1, 2000] - - [1068, 462.094] + - [1071, 462.094] - - [8, 500, 1, 500] - - [1049, 231.481] + - [1052, 231.481] - - [4, 500, 1, 512] - - [1049, 118.519] + - [1052, 118.519] - - [10, 500, 1, 10] - - [1043, 10.9649] + - [1046, 10.9649] - - [64, 512, 1, 2000] - - [1049, 2116.8] + - [1052, 2116.8] - - [500, 512, 1, 10] - - [1170, 395.062] + - [1173, 395.062] - - [200, 512, 1, 512] - - [1188, 3449.26] + - [1191, 3449.26] - - [512, 500, 1, 500] - - [1144, 5536.33] + - [1147, 5536.33] - - [32, 512, 1, 2000] - - [1059, 1264.2] + - [1062, 1264.2] - - [128, 500, 1, 2048] - - [1115, 3006.24] + - [1118, 3006.24] - - [500, 2048, 1, 10] - - [1140, 1049.18] + - [1143, 1049.18] - - [512, 512, 1, 100] - - [1177, 2664.06] + - [1180, 2664.06] - - [200, 2000, 1, 512] - - [1185, 5192.7] + - [1188, 5192.7] - - [500, 500, 1, 512] - - [1141, 5673.76] + - [1144, 5673.76] - - [128, 2048, 1, 500] - - [1173, 5251.28] + - [1176, 5251.28] - - [4, 512, 1, 512] - - [1049, 123.653] + - [1052, 123.653] - - [16, 2048, 1, 2000] - - [1065, 2294.68] + - [1068, 2294.68] - - [16, 500, 1, 1024] - - [1049, 562.637] + - [1052, 562.637] - - [256, 2000, 1, 500] - - [1179, 6639.0] + - [1182, 6639.0] - - [10, 1024, 1, 10] - - [1029, 20.9836] + - [1032, 20.9836] - - [16, 500, 1, 500] - - [1049, 446.429] + - [1052, 446.429] - - [10, 2048, 1, 512] - - [1047, 784.862] + - [1050, 784.862] - - [200, 500, 1, 10] - - [1022, 176.056] + - [1025, 176.056] - - [256, 2048, 1, 512] - - [1176, 6540.83] + - [1179, 6540.83] - - [256, 2000, 1, 2048] - - [1153, 6670.33] + - [1156, 6670.33] - - [500, 2048, 1, 500] - - [1181, 7264.47] + - [1184, 7264.47] - - [500, 100, 1, 1024] - - [1203, 2700.42] + - [1206, 2700.42] - - [16, 100, 1, 512] - - [1113, 96.6038] + - [1116, 96.6038] - - [64, 512, 1, 2048] - - [1114, 1868.29] + - [1117, 1868.29] - - [32, 1024, 1, 10] - - [1025, 69.4237] + - [1028, 69.4237] - - [16, 2048, 1, 512] - - [1098, 1226.4] + - [1101, 1226.4] - - [8, 1024, 1, 512] - - [1098, 416.102] + - [1101, 416.102] - - [4, 1024, 1, 2048] - - [1120, 223.101] + - [1123, 223.101] - - [100, 2048, 1, 2000] - - [1193, 5614.04] + - [1196, 5614.04] - - [512, 512, 1, 2048] - - [1158, 6868.87] + - [1161, 6868.87] - - [256, 2000, 1, 1024] - - [1149, 5758.88] + - [1152, 5758.88] - - [64, 512, 1, 512] - - [1218, 1651.3] + - [1221, 1651.3] - - [200, 1024, 1, 10] - - [1032, 341.333] + - [1035, 341.333] - - [128, 500, 1, 500] - - [1061, 2580.65] + - [1064, 2580.65] - - [100, 512, 1, 1024] - - [1052, 2041.62] + - [1055, 2041.62] - - [16, 1024, 1, 500] - - [1049, 867.797] + - [1052, 867.797] - - [128, 100, 1, 2048] - - [1219, 1011.36] + - [1222, 1011.36] - - [100, 512, 1, 500] - - [1052, 2051.28] + - [1055, 2051.28] - - [8, 1024, 1, 1024] - - [1065, 424.525] + - [1068, 424.525] - - [2, 2000, 1, 10] - - [1091, 8.47458] + - [1094, 8.47458] - - [4, 500, 1, 10] - - [1088, 4.46429] + - [1091, 4.46429] - - [500, 2000, 1, 2048] - - [1165, 7444.02] + - [1168, 7444.02] - - [4, 2000, 1, 100] - - [1101, 128.205] + - [1104, 128.205] - - [512, 2000, 1, 2000] - - [1151, 8454.43] + - [1154, 8454.43] - - [128, 500, 1, 10] - - [1210, 117.647] + - [1213, 117.647] - - [32, 1024, 1, 100] - - [1058, 512.0] + - [1061, 512.0] - - [8, 500, 1, 2048] - - [1073, 286.835] + - [1076, 286.835] - - [16, 1024, 1, 1024] - - [1037, 881.156] + - [1040, 881.156] - - [200, 100, 1, 10] - - [1209, 40.3226] + - [1212, 40.3226] - - [512, 100, 1, 500] - - [1203, 1987.58] + - [1206, 1987.58] - - [512, 2048, 1, 2048] - - [1160, 8063.55] + - [1163, 8063.55] - - [16, 2000, 1, 512] - - [1059, 1204.71] + - [1062, 1204.71] - - [64, 2048, 1, 1024] - - [1057, 2853.27] + - [1060, 2853.27] - - [32, 2048, 1, 10] - - [1031, 130.032] + - [1034, 130.032] - - [10, 2048, 1, 10] - - [1033, 39.3846] + - [1036, 39.3846] - - [4, 2000, 1, 512] - - [1049, 316.049] + - [1052, 316.049] - - [4, 500, 1, 100] - - [1048, 35.7143] + - [1051, 35.7143] - - [8, 100, 1, 2048] - - [1068, 84.6281] + - [1071, 84.6281] - - [512, 2048, 1, 10] - - [1148, 1224.97] + - [1151, 1224.97] - - [512, 100, 1, 10] - - [1137, 90.1408] + - [1140, 90.1408] - - [4, 512, 1, 1024] - - [1049, 143.248] + - [1052, 143.248] - - [16, 2048, 1, 10] - - [1082, 65.0159] + - [1085, 65.0159] - - [500, 2000, 1, 100] - - [1133, 4716.98] + - [1136, 4716.98] - - [32, 1024, 1, 2048] - - [1076, 1582.76] + - [1079, 1582.76] - - [100, 2000, 1, 2000] - - [1193, 5512.68] + - [1196, 5512.68] - - [128, 100, 1, 512] - - [1219, 561.096] + - [1222, 561.096] - - [500, 500, 1, 100] - - [1173, 2460.63] + - [1176, 2460.63] - - [32, 2000, 1, 10] - - [1025, 119.403] + - [1028, 119.403] - - [128, 2048, 1, 100] - - [1173, 2708.1] + - [1176, 2708.1] - - [10, 2000, 1, 100] - - [1048, 316.456] + - [1051, 316.456] - - [2, 2048, 1, 500] - - [1059, 191.045] + - [1062, 191.045] - - [32, 1024, 1, 500] - - [1059, 1563.36] + - [1062, 1563.36] - - [4, 1024, 1, 10] - - [1088, 9.14286] + - [1091, 9.14286] - - [100, 512, 1, 10] - - [1214, 96.9697] + - [1217, 96.9697] - - [8, 100, 1, 100] - - [1064, 14.2857] + - [1067, 14.2857] - - [128, 512, 1, 500] - - [1052, 2677.12] + - [1055, 2677.12] - - [16, 100, 1, 2048] - - [1075, 161.897] + - [1078, 161.897] - - [2, 1024, 1, 10] - - [1088, 4.49123] + - [1091, 4.49123] - - [4, 100, 1, 2048] - - [1068, 41.7959] + - [1071, 41.7959] - - [4, 512, 1, 2000] - - [1068, 180.282] + - [1071, 180.282] - - [4096, 64, 1, 2048] - - [1268, 7247.18] + - [1271, 7247.18] - - [1024, 10080, 1, 1024] - - [1256, 9833.37] + - [1259, 9833.37] - - [1024, 1131, 1, 1024] - - [1234, 7551.85] + - [1237, 7551.85] - - [36548, 1216, 1, 1024] - - [1246, 10351.5] + - [1249, 10351.5] - - [1024, 29, 1, 1024] - - [1278, 1696.91] + - [1281, 1696.91] - - [1024, 2592, 1, 1024] - - [1247, 8424.01] + - [1250, 8424.01] - - [1024, 1568, 1, 1024] - - [1258, 7511.76] + - [1261, 7511.76] - - [4096, 91, 1, 2048] - - [1227, 5599.81] + - [1230, 5599.81] - - [1024, 4445, 1, 1024] - - [1245, 9261.12] + - [1248, 9261.12] - - [1024, 6272, 1, 1024] - - [1240, 9439.51] + - [1243, 9439.51] - - [36548, 3584, 1, 1024] - - [1239, 10393.7] + - [1242, 10393.7] - - [1024, 1827, 1, 1024] - - [1258, 8714.32] + - [1261, 8714.32] - - [1024, 3220, 1, 1024] - - [1238, 8861.1] + - [1241, 8861.1] - - [1024, 1856, 1, 1024] - - [1255, 8826.95] + - [1258, 8826.95] - - [1024, 1760, 1, 1024] - - [1255, 8334.1] + - [1258, 8334.1] - - [1024, 1600, 1, 1024] - - [1255, 7614.97] + - [1258, 7614.97] - - [1024, 1, 1, 21] - - [1259, 0.0] + - [1262, 0.0] - - [36548, 4235, 1, 1024] - - [1239, 10276.7] + - [1242, 10276.7] - - [1024, 49, 1, 1024] - - [1274, 2643.02] + - [1277, 2643.02] - - [1024, 1984, 1, 1024] - - [1258, 9449.42] + - [1261, 9449.42] - - [1024, 14720, 1, 1024] - - [1245, 10033.2] + - [1248, 10033.2] - - [1024, 1152, 1, 1024] - - [1228, 7523.44] + - [1231, 7523.44] - - [36548, 14976, 1, 1024] - - [1246, 10421.6] + - [1249, 10421.6] - - [36548, 1152, 1, 1024] - - [1246, 10258.0] + - [1249, 10258.0] - - [4096, 86, 1, 3072] - - [1227, 5308.75] + - [1230, 5308.75] - - [1024, 3392, 1, 1024] - - [1240, 9176.44] + - [1243, 9176.44] - - [1024, 1408, 1, 1024] - - [1240, 8958.73] + - [1243, 8958.73] - - [1024, 2080, 1, 1024] - - [1231, 8396.39] + - [1234, 8396.39] - - [1024, 1824, 1, 1024] - - [1249, 8671.61] + - [1252, 8671.61] - - [36548, 2432, 1, 1024] - - [1239, 10392.5] + - [1242, 10392.5] - - [4096, 29, 1, 2048] - - [1260, 4325.56] + - [1263, 4325.56] - - [1024, 1102, 1, 1024] - - [1234, 7204.08] + - [1237, 7204.08] - - [4096, 49, 1, 2048] - - [1266, 5609.19] + - [1269, 5609.19] - - [36548, 1827, 1, 1024] - - [1246, 10183.1] + - [1249, 10183.1] - - [4096, 25, 1, 2048] - - [1261, 3788.21] + - [1264, 3788.21] - - [1024, 10176, 1, 1024] - - [1256, 9941.08] + - [1259, 9941.08] - - [1024, 774, 1, 1024] - - [1241, 7079.57] + - [1244, 7079.57] - - [1024, 1952, 1, 1024] - - [1258, 9300.39] + - [1261, 9300.39] - - [4096, 128, 1, 2048] - - [1228, 8274.86] + - [1231, 8274.86] - - [1024, 17024, 1, 1024] - - [1238, 9960.62] + - [1241, 9960.62] - - [1024, 1472, 1, 1024] - - [1247, 9343.27] + - [1250, 9343.27] - - [36548, 4459, 1, 1024] - - [1239, 10358.0] + - [1242, 10358.0] - - [4096, 91, 1, 3072] - - [1233, 5509.29] + - [1236, 5509.29] - - [1024, 3712, 1, 1024] - - [1247, 9048.56] + - [1250, 9048.56] - - [4096, 64, 1, 3072] - - [1280, 7489.83] + - [1283, 7489.83] - - [4096, 29, 1, 3072] - - [1260, 4511.68] + - [1263, 4511.68] - - [4096, 128, 1, 3072] - - [1227, 8423.73] + - [1230, 8423.73] - - [36548, 12928, 1, 1024] - - [1246, 10426.0] + - [1249, 10426.0] - - [1024, 1632, 1, 1024] - - [1228, 7761.63] + - [1231, 7761.63] - - [1024, 1696, 1, 1024] - - [1253, 8107.19] + - [1256, 8107.19] - - [4096, 24, 1, 2048] - - [1260, 3663.15] + - [1263, 3663.15] - - [4096, 63, 1, 3072] - - [1269, 7175.27] + - [1272, 7175.27] - - [4096, 96, 1, 2048] - - [1228, 5866.18] + - [1231, 5866.18] - - [36548, 1764, 1, 1024] - - [1239, 10128.4] + - [1242, 10128.4] - - [4096, 32, 1, 2048] - - [1264, 4540.52] + - [1267, 4540.52] - - [1024, 35, 1, 1024] - - [1272, 1911.47] + - [1275, 1911.47] - - [1024, 1120, 1, 1024] - - [1227, 7289.03] + - [1230, 7289.03] - - [4096, 49, 1, 3072] - - [1266, 5751.52] + - [1269, 5751.52] - - [1024, 24, 1, 1024] - - [1272, 1391.92] + - [1275, 1391.92] - - [1024, 2944, 1, 1024] - - [1248, 9284.83] + - [1251, 9284.83] - - [36548, 14080, 1, 1024] - - [1239, 10441.3] + - [1242, 10441.3] - - [1024, 1, 1, 1024] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 1280, 1, 1024] - - [1227, 8244.36] + - [1230, 8244.36] - - [1024, 13440, 1, 1024] - - [1239, 9799.82] + - [1242, 9799.82] - - [1024, 1015, 1, 1024] - - [1247, 9187.75] + - [1250, 9187.75] - - [36548, 9120, 1, 1024] - - [1239, 10399.9] + - [1242, 10399.9] - - [36548, 1, 1, 1024] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 3008, 1, 1024] - - [1248, 9468.45] + - [1251, 9468.45] - - [1024, 2560, 1, 1024] - - [1245, 8879.21] + - [1248, 8879.21] - - [1024, 21, 1, 1024] - - [1271, 1234.31] + - [1274, 1234.31] - - [1024, 2208, 1, 1024] - - [1227, 8231.17] + - [1230, 8231.17] - - [1024, 96, 1, 1024] - - [1277, 3767.34] + - [1280, 3767.34] - - [4096, 86, 1, 2048] - - [1228, 5528.99] + - [1231, 5528.99] - - [4096, 96, 1, 3072] - - [1227, 6273.18] + - [1230, 6273.18] - - [1024, 1920, 1, 1024] - - [1257, 9118.09] + - [1260, 9118.09] - - [4096, 27, 1, 2048] - - [1260, 4073.6] + - [1263, 4073.6] - - [36548, 2496, 1, 1024] - - [1239, 10361.1] + - [1242, 10361.1] - - [1024, 1, 1, 14] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 91, 1, 1024] - - [1279, 3647.57] + - [1282, 3647.57] - - [1024, 2016, 1, 1024] - - [1255, 9560.14] + - [1258, 9560.14] - - [1024, 1184, 1, 1024] - - [1228, 7678.86] + - [1231, 7678.86] - - [4096, 1, 1, 2048] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 1664, 1, 1024] - - [1253, 7933.97] + - [1256, 7933.97] - - [1024, 11424, 1, 1024] - - [1245, 9777.81] + - [1248, 9777.81] - - [4096, 24, 1, 3072] - - [1263, 3813.0] + - [1266, 3813.0] - - [1024, 1216, 1, 1024] - - [1227, 7902.03] + - [1230, 7902.03] - - [36548, 3185, 1, 1024] - - [1239, 10336.6] + - [1242, 10336.6] - - [36548, 9216, 1, 1024] - - [1239, 10414.2] + - [1242, 10414.2] - - [1024, 3200, 1, 1024] - - [1245, 8846.91] + - [1248, 8846.91] - - [1024, 2656, 1, 1024] - - [1240, 8649.15] + - [1243, 8649.15] - - [1024, 2368, 1, 1024] - - [1240, 8873.06] + - [1243, 8873.06] - - [1024, 4459, 1, 1024] - - [1247, 9431.22] + - [1250, 9431.22] - - [1024, 3808, 1, 1024] - - [1247, 9263.62] + - [1250, 9263.62] - - [1024, 2336, 1, 1024] - - [1240, 8965.9] + - [1243, 8965.9] - - [4096, 27, 1, 3072] - - [1260, 4171.64] + - [1263, 4171.64] - - [1024, 2304, 1, 1024] - - [1237, 8601.28] + - [1240, 8601.28] - - [1024, 1560, 1, 1024] - - [1252, 7481.64] + - [1255, 7481.64] - - [4096, 35, 1, 3072] - - [1266, 4176.8] + - [1269, 4176.8] - - [1024, 2496, 1, 1024] - - [1243, 9092.76] + - [1246, 9092.76] - - [1024, 1504, 1, 1024] - - [1243, 9220.43] + - [1246, 9220.43] - - [4096, 50, 1, 2048] - - [1267, 5472.73] + - [1270, 5472.73] - - [1024, 3232, 1, 1024] - - [1240, 8961.84] + - [1243, 8961.84] - - [1024, 14, 1, 1024] - - [1271, 882.215] + - [1274, 882.215] - - [36548, 1015, 1, 1024] - - [1239, 10140.8] + - [1242, 10140.8] - - [1024, 2000, 1, 1024] - - [1251, 9487.7] + - [1254, 9487.7] - - [36548, 243, 1, 1024] - - [1244, 9441.02] + - [1247, 9441.02] - - [36548, 32, 1, 1024] - - [1232, 4720.95] + - [1235, 4720.95] - - [1024, 25, 1, 1024] - - [1278, 1462.86] + - [1281, 1462.86] - - [1024, 13184, 1, 1024] - - [1242, 9866.18] + - [1245, 9866.18] - - [1024, 2688, 1, 1024] - - [1237, 8559.83] + - [1240, 8559.83] - - [1024, 27, 1, 1024] - - [1276, 1559.01] + - [1279, 1559.01] - - [36548, 950, 1, 1024] - - [1246, 10053.5] + - [1249, 10053.5] - - [1024, 1764, 1, 1024] - - [1253, 8347.01] + - [1256, 8347.01] - - [1024, 992, 1, 1024] - - [1240, 9035.72] + - [1243, 9035.72] - - [1024, 1376, 1, 1024] - - [1240, 8797.86] + - [1243, 8797.86] - - [1024, 950, 1, 1024] - - [1247, 8635.16] + - [1250, 8635.16] - - [36548, 774, 1, 1024] - - [1239, 9460.72] + - [1242, 9460.72] - - [36548, 25, 1, 1024] - - [1232, 3694.06] + - [1235, 3694.06] - - [1024, 4256, 1, 1024] - - [1240, 9172.06] + - [1243, 9172.06] - - [4096, 32, 1, 3072] - - [1261, 4886.57] + - [1264, 4886.57] - - [1024, 243, 1, 1024] - - [1265, 6594.31] + - [1268, 6594.31] - - [36548, 3712, 1, 1024] - - [1239, 10401.5] + - [1242, 10401.5] - - [1024, 50, 1, 1024] - - [1274, 2742.09] + - [1277, 2742.09] - - [1024, 3360, 1, 1024] - - [1236, 9017.27] + - [1239, 9017.27] - - [1024, 2048, 1, 1024] - - [1251, 9736.55] + - [1254, 9736.55] - - [1024, 2784, 1, 1024] - - [1247, 8835.5] + - [1250, 8835.5] - - [1024, 4992, 1, 1024] - - [1245, 9639.28] + - [1248, 9639.28] - - [36548, 1102, 1, 1024] - - [1246, 9858.94] + - [1249, 9858.94] - - [1024, 1536, 1, 1024] - - [1238, 9294.88] + - [1241, 9294.88] - - [1024, 2720, 1, 1024] - - [1243, 8617.78] + - [1246, 8617.78] - - [4096, 1, 1, 3072] - - [1259, 0.0] + - [1262, 0.0] - - [1024, 2752, 1, 1024] - - [1247, 8902.07] + - [1250, 8902.07] - - [1024, 2816, 1, 1024] - - [1245, 8906.85] + - [1248, 8906.85] - - [1024, 2624, 1, 1024] - - [1247, 8494.31] + - [1250, 8494.31] - - [1024, 2144, 1, 1024] - - [1230, 8243.46] + - [1233, 8243.46] - - [36548, 1131, 1, 1024] - - [1246, 10104.5] + - [1249, 10104.5] - - [4096, 25, 1, 3072] - - [1261, 3959.88] + - [1264, 3959.88] - - [1024, 64, 1, 1024] - - [1274, 3410.0] + - [1277, 3410.0] - - [1024, 3296, 1, 1024] - - [1245, 9066.42] + - [1248, 9066.42] - - [36548, 4992, 1, 1024] - - [1239, 10395.5] + - [1242, 10395.5] - - [1024, 1344, 1, 1024] - - [1240, 8522.56] + - [1243, 8522.56] - - [36548, 2401, 1, 1024] - - [1239, 10250.2] + - [1242, 10250.2] - - [1024, 15744, 1, 1024] - - [1239, 10006.3] + - [1242, 10006.3] - - [1024, 15232, 1, 1024] - - [1238, 9912.11] + - [1241, 9912.11] - - [1024, 1888, 1, 1024] - - [1250, 8962.88] + - [1253, 8962.88] - - [1024, 1792, 1, 1024] - - [1254, 8556.72] + - [1257, 8556.72] - - [36548, 1073, 1, 1024] - - [1239, 10161.1] + - [1242, 10161.1] - - [4096, 50, 1, 3072] - - [1266, 5882.06] + - [1269, 5882.06] - - [36548, 15488, 1, 1024] - - [1246, 10437.0] + - [1249, 10437.0] - - [1024, 2464, 1, 1024] - - [1243, 8879.92] + - [1246, 8879.92] - - [1024, 2272, 1, 1024] - - [1240, 8720.25] + - [1243, 8720.25] - - [1024, 13, 1, 1024] - - [1270, 774.516] + - [1273, 774.516] - - [1024, 2432, 1, 1024] - - [1245, 8491.43] + - [1248, 8491.43] - - [36548, 24, 1, 1024] - - [1232, 3564.31] + - [1235, 3564.31] - - [1024, 3936, 1, 1024] - - [1255, 9433.2] + - [1258, 9433.2] - - [36548, 13824, 1, 1024] - - [1239, 10439.7] + - [1242, 10439.7] - - [1024, 2401, 1, 1024] - - [1247, 8869.93] + - [1250, 8869.93] - - [1024, 32, 1, 1024] - - [1262, 1839.61] + - [1265, 1839.61] - - [1024, 2176, 1, 1024] - - [1231, 8544.45] + - [1234, 8544.45] - - [1024, 2240, 1, 1024] - - [1240, 8381.45] + - [1243, 8381.45] - - [1024, 1728, 1, 1024] - - [1228, 8212.23] + - [1231, 8212.23] - - [1024, 128, 1, 1024] - - [1275, 4660.34] + - [1278, 4660.34] - - [1024, 216, 1, 1024] - - [1265, 5777.87] + - [1268, 5777.87] - - [1024, 63, 1, 1024] - - [1273, 3329.65] + - [1276, 3329.65] - - [1024, 86, 1, 1024] - - [1279, 3533.6] + - [1282, 3533.6] - - [1024, 2528, 1, 1024] - - [1235, 8789.15] + - [1238, 8789.15] - - [1024, 2400, 1, 1024] - - [1240, 8939.3] + - [1243, 8939.3] - - [1024, 1440, 1, 1024] - - [1247, 9131.31] + - [1250, 9131.31] - - [1024, 2912, 1, 1024] - - [1240, 9139.93] + - [1243, 9139.93] - - [4096, 35, 1, 2048] - - [1266, 4059.75] + - [1269, 4059.75] - - [4096, 63, 1, 2048] - - [1268, 6946.4] + - [1271, 6946.4] - - [1024, 2880, 1, 1024] - - [1238, 9104.88] + - [1241, 9104.88] - - [1024, 4064, 1, 1024] - - [1257, 9715.1] + - [1260, 9715.1] - - [1024, 4655, 1, 1024] - - [1245, 9033.8] + - [1248, 9033.8] - - [1024, 1088, 1, 1024] - - [1229, 8144.31] + - [1232, 8144.31] - - [36548, 6272, 1, 1024] - - [1246, 10427.3] + - [1249, 10427.3] - - [1024, 1, 1, 13] - - [1259, 0.0] + - [1262, 0.0] - - [768, 512, 1, 768] - - [1283, 5889.04] + - [1286, 5889.04] - - [768, 2048, 1, 3072] - - [1293, 9394.62] + - [1296, 9394.62] - - [768, 32, 1, 768] - - [1305, 1502.74] + - [1308, 1502.74] - - [64, 128, 96, 128] - - [1300, 4973.48] + - [1303, 4973.48] - - [3072, 1024, 1, 768] - - [1294, 9856.07] + - [1297, 9856.07] - - [768, 1024, 1, 3072] - - [1287, 8611.06] + - [1290, 8611.06] - - [768, 512, 1, 3072] - - [1286, 6430.79] + - [1289, 6430.79] - - [768, 64, 1, 768] - - [1307, 2621.44] + - [1310, 2621.44] - - [768, 4096, 1, 3072] - - [1292, 10030.4] + - [1295, 10030.4] - - [768, 2048, 1, 2] - - [1285, 381.763] + - [1288, 381.763] - - [768, 2048, 1, 768] - - [1290, 9754.2] + - [1293, 9754.2] - - [768, 320, 1, 30522] - - [1303, 8529.4] + - [1306, 8529.4] - - [64, 64, 96, 64] - - [1297, 2496.61] + - [1300, 2496.61] - - [768, 640, 1, 30522] - - [1284, 8253.84] + - [1287, 8253.84] - - [768, 1280, 1, 30522] - - [1289, 9572.85] + - [1292, 9572.85] - - [768, 1280, 1, 768] - - [1293, 8713.93] + - [1296, 8713.93] - - [768, 640, 1, 768] - - [1283, 7293.03] + - [1286, 7293.03] - - [768, 32, 1, 2] - - [1295, 11.8154] + - [1298, 11.8154] - - [3072, 2048, 1, 768] - - [1290, 10019.6] + - [1293, 10019.6] - - [768, 4096, 1, 768] - - [1290, 9927.35] + - [1293, 9927.35] - - [3072, 4096, 1, 768] - - [1293, 10150.1] + - [1296, 10150.1] - - [64, 256, 192, 256] - - [1299, 7054.19] + - [1302, 7054.19] - - [768, 8, 1, 768] - - [1306, 340.939] + - [1309, 340.939] - - [64, 128, 384, 128] - - [1298, 6765.01] + - [1301, 6765.01] - - [768, 1024, 1, 768] - - [1288, 8768.58] + - [1291, 8768.58] - - [768, 320, 1, 768] - - [1304, 6838.54] + - [1307, 6838.54] - - [64, 64, 768, 64] - - [1301, 5388.83] + - [1304, 5388.83] - - [768, 1024, 1, 2] - - [1281, 258.695] + - [1284, 258.695] - - [768, 16, 1, 768] - - [1306, 819.2] + - [1309, 819.2] - - [64, 256, 96, 256] - - [1299, 5893.64] + - [1302, 5893.64] - - [3072, 512, 1, 768] - - [1291, 9722.79] + - [1294, 9722.79] - - [768, 160, 1, 768] - - [1308, 5019.78] + - [1311, 5019.78] - - [768, 4096, 1, 2] - - [1282, 507.375] + - [1285, 507.375] - - [1600, 512, 1, 1024] - - [1312, 7186.95] + - [1315, 7186.95] - - [1024, 512, 1, 64] - - [1310, 2557.5] + - [1313, 2557.5] - - [1024, 512, 1, 1] - - [1309, 71.2348] + - [1312, 71.2348] - - [2048, 512, 1, 1] - - [1311, 90.3945] + - [1314, 90.3945] - - [1024, 200, 1, 1] - - [1317, 40.0] + - [1320, 40.0] - - [32, 200, 1, 1] - - [1313, 1.56863] + - [1316, 1.56863] - - [560, 200, 1, 1024] - - [1321, 4731.35] + - [1324, 4731.35] - - [1, 512, 1, 1] - - [1320, 0.130612] + - [1323, 0.130612] - - [64, 512, 1, 1] - - [1315, 7.58519] + - [1318, 7.58519] - - [1024, 8192, 1, 256] - - [1330, 9518.99] + - [1333, 9518.99] - - [1024, 22016, 1, 256] - - [1336, 9881.12] + - [1339, 9881.12] - - [256, 8976, 1, 4352] - - [1328, 9567.08] + - [1331, 9567.08] - - [512, 256, 1, 2048] - - [1341, 5917.89] + - [1344, 5917.89] - - [1024, 19968, 1, 256] - - [1336, 9882.37] + - [1339, 9882.37] - - [256, 8976, 1, 1536] - - [1326, 8437.35] + - [1329, 8437.35] - - [256, 8976, 1, 33536] - - [1326, 8441.89] + - [1329, 8441.89] - - [1024, 1792, 1, 256] - - [1326, 7756.97] + - [1329, 7756.97] - - [1024, 21504, 1, 256] - - [1336, 9893.9] + - [1339, 9893.9] - - [512, 215, 1, 2048] - - [1342, 4665.64] + - [1345, 4665.64] - - [1024, 7168, 1, 256] - - [1330, 9509.35] + - [1333, 9509.35] - - [256, 8976, 1, 15872] - - [1332, 8914.65] + - [1335, 8914.65] - - [1024, 19712, 1, 256] - - [1336, 9771.9] + - [1339, 9771.9] - - [256, 8976, 1, 5632] - - [1332, 8740.03] + - [1335, 8740.03] - - [1024, 14848, 1, 256] - - [1336, 9756.15] + - [1339, 9756.15] - - [1024, 28672, 1, 256] - - [1336, 9958.92] + - [1339, 9958.92] - - [256, 8976, 1, 9728] - - [1339, 8853.04] + - [1342, 8853.04] - - [1024, 17152, 1, 256] - - [1330, 9737.3] + - [1333, 9737.3] - - [256, 8976, 1, 11520] - - [1332, 8999.2] + - [1335, 8999.2] - - [256, 8976, 1, 8192] - - [1322, 7897.32] + - [1325, 7897.32] - - [1024, 3328, 1, 256] - - [1337, 8593.53] + - [1340, 8593.53] - - [256, 8976, 1, 7424] - - [1332, 8980.47] + - [1335, 8980.47] - - [1024, 18944, 1, 256] - - [1336, 9854.85] + - [1339, 9854.85] - - [1024, 10496, 1, 256] - - [1331, 9453.9] + - [1334, 9453.9] - - [256, 8976, 1, 5376] - - [1329, 9608.37] + - [1332, 9608.37] - - [256, 8976, 1, 6144] - - [1326, 7880.13] + - [1329, 7880.13] - - [1024, 40448, 1, 256] - - [1336, 10016.6] + - [1339, 10016.6] - - [256, 8976, 1, 22016] - - [1339, 8939.87] + - [1342, 8939.87] - - [256, 8976, 1, 4864] - - [1327, 9211.43] + - [1330, 9211.43] - - [256, 8976, 1, 12288] - - [1323, 8065.05] + - [1326, 8065.05] - - [1024, 9728, 1, 256] - - [1336, 9636.25] + - [1339, 9636.25] - - [256, 8976, 1, 2048] - - [1324, 7001.33] + - [1327, 7001.33] - - [1024, 10240, 1, 256] - - [1330, 9619.96] + - [1333, 9619.96] - - [256, 8976, 1, 2304] - - [1328, 9509.74] + - [1331, 9509.74] - - [1024, 7936, 1, 256] - - [1336, 9300.67] + - [1339, 9300.67] - - [768, 256, 1, 2048] - - [1340, 6267.95] + - [1343, 6267.95] - - [1024, 9984, 1, 256] - - [1336, 9477.28] + - [1339, 9477.28] - - [1024, 13312, 1, 256] - - [1336, 9758.56] + - [1339, 9758.56] - - [1024, 16128, 1, 256] - - [1330, 9721.9] + - [1333, 9721.9] - - [1024, 8960, 1, 256] - - [1331, 9398.25] + - [1334, 9398.25] - - [1024, 5120, 1, 256] - - [1337, 9315.5] + - [1340, 9315.5] - - [1024, 11264, 1, 256] - - [1330, 9664.8] + - [1333, 9664.8] - - [256, 8976, 1, 20480] - - [1338, 8279.87] + - [1341, 8279.87] - - [1024, 20992, 1, 256] - - [1330, 9878.87] + - [1333, 9878.87] - - [256, 8976, 1, 9472] - - [1332, 8990.96] + - [1335, 8990.96] - - [256, 8976, 1, 8448] - - [1332, 8983.52] + - [1335, 8983.52] - - [256, 8976, 1, 20992] - - [1333, 8942.11] + - [1336, 8942.11] - - [256, 8976, 1, 10496] - - [1333, 8989.71] + - [1336, 8989.71] - - [1024, 15104, 1, 256] - - [1331, 9676.01] + - [1334, 9676.01] - - [1024, 6400, 1, 256] - - [1339, 9145.89] + - [1342, 9145.89] - - [1024, 4096, 1, 256] - - [1332, 9124.25] + - [1335, 9124.25] - - [256, 8976, 1, 2560] - - [1326, 8566.11] + - [1329, 8566.11] - - [256, 8976, 1, 2816] - - [1328, 9496.84] + - [1331, 9496.84] - - [1024, 7680, 1, 256] - - [1336, 9460.84] + - [1339, 9460.84] - - [256, 8976, 1, 14336] - - [1333, 8226.8] + - [1336, 8226.8] - - [256, 8976, 1, 6656] - - [1333, 8771.42] + - [1336, 8771.42] - - [1024, 3072, 1, 256] - - [1333, 9076.94] + - [1336, 9076.94] - - [256, 8976, 1, 5888] - - [1329, 9546.3] + - [1332, 9546.3] - - [1024, 12288, 1, 256] - - [1330, 9690.81] + - [1333, 9690.81] - - [256, 8976, 1, 26112] - - [1335, 8699.83] + - [1338, 8699.83] - - [1024, 7424, 1, 256] - - [1337, 9256.84] + - [1340, 9256.84] - - [256, 8976, 1, 14848] - - [1338, 8885.79] + - [1341, 8885.79] - - [768, 215, 1, 2048] - - [1340, 5628.59] + - [1343, 5628.59] - - [1024, 2560, 1, 256] - - [1333, 8820.83] + - [1336, 8820.83] - - [256, 8976, 1, 19968] - - [1332, 8928.86] + - [1335, 8928.86] - - [256, 8976, 1, 9984] - - [1332, 8993.12] + - [1335, 8993.12] - - [1024, 4864, 1, 256] - - [1333, 8974.3] + - [1336, 8974.3] - - [1024, 33536, 1, 256] - - [1336, 9943.07] + - [1339, 9943.07] - - [256, 8976, 1, 15104] - - [1333, 8996.63] + - [1336, 8996.63] - - [1024, 2048, 1, 256] - - [1331, 8462.66] + - [1334, 8462.66] - - [256, 8976, 1, 8960] - - [1333, 8998.92] + - [1336, 8998.92] - - [1024, 6144, 1, 256] - - [1338, 9359.67] + - [1341, 9359.67] - - [1024, 14592, 1, 256] - - [1336, 9667.42] + - [1339, 9667.42] - - [256, 8976, 1, 19712] - - [1332, 9020.11] + - [1335, 9020.11] - - [1024, 11520, 1, 256] - - [1331, 9527.7] + - [1334, 9527.7] - - [1024, 5632, 1, 256] - - [1330, 9297.2] + - [1333, 9297.2] - - [256, 8976, 1, 11008] - - [1339, 8994.8] + - [1342, 8994.8] - - [256, 8976, 1, 17152] - - [1333, 9003.8] + - [1336, 9003.8] - - [256, 8976, 1, 3072] - - [1322, 8261.96] + - [1325, 8261.96] - - [1024, 3840, 1, 256] - - [1339, 8671.89] + - [1342, 8671.89] - - [1024, 14336, 1, 256] - - [1336, 9760.28] + - [1339, 9760.28] - - [1024, 20480, 1, 256] - - [1330, 9887.85] + - [1333, 9887.85] - - [1024, 23552, 1, 256] - - [1330, 9890.46] + - [1333, 9890.46] - - [256, 8976, 1, 7168] - - [1325, 8478.34] + - [1328, 8478.34] - - [1024, 13568, 1, 256] - - [1330, 9654.64] + - [1333, 9654.64] - - [1024, 4608, 1, 256] - - [1338, 9218.25] + - [1341, 9218.25] - - [256, 8976, 1, 10240] - - [1323, 8076.16] + - [1326, 8076.16] - - [1024, 8704, 1, 256] - - [1332, 9475.5] + - [1335, 9475.5] - - [1024, 11008, 1, 256] - - [1336, 9524.96] + - [1339, 9524.96] - - [1024, 8448, 1, 256] - - [1330, 9352.16] + - [1333, 9352.16] - - [256, 8976, 1, 44505] - - [1334, 8430.23] + - [1337, 8430.23] - - [6272, 256, 1, 528] - - [1386, 7389.94] + - [1389, 7389.94] - - [3136, 2048, 1, 1024] - - [1367, 9657.94] + - [1370, 9657.94] - - [6272, 112, 1, 512] - - [1365, 5931.09] + - [1368, 5931.09] - - [2048, 320, 1, 1280] - - [1385, 7772.99] + - [1388, 7772.99] - - [289, 256, 1, 1568] - - [1406, 3718.17] + - [1409, 3718.17] - - [50176, 128, 1, 256] - - [1368, 8908.58] + - [1371, 8908.58] - - [5329, 64, 1, 448] - - [1351, 4602.2] + - [1354, 4602.2] - - [289, 192, 1, 1344] - - [1403, 3452.59] + - [1406, 3452.59] - - [12544, 1024, 1, 256] - - [1368, 9742.64] + - [1371, 9742.64] - - [784, 64, 32, 192] - - [1344, 6844.61] + - [1347, 6844.61] - - [6272, 64, 1, 480] - - [1352, 5562.24] + - [1355, 5562.24] - - [196, 128, 1, 800] - - [1394, 1639.74] + - [1397, 1639.74] - - [64, 512, 1, 1344] - - [1393, 2313.04] + - [1396, 2313.04] - - [6272, 64, 1, 512] - - [1351, 5609.19] + - [1354, 5609.19] - - [6272, 160, 1, 528] - - [1352, 6149.7] + - [1355, 6149.7] - - [289, 160, 32, 768] - - [1379, 6637.82] + - [1382, 6637.82] - - [12544, 256, 1, 1024] - - [1386, 8790.46] + - [1389, 8790.46] - - [289, 224, 1, 1568] - - [1406, 3270.17] + - [1409, 3270.17] - - [5329, 64, 32, 160] - - [1359, 9091.04] + - [1362, 9091.04] - - [5329, 96, 1, 576] - - [1386, 5555.66] + - [1389, 5555.66] - - [3025, 64, 1, 363] - - [1404, 4392.3] + - [1407, 4392.3] - - [784, 32, 32, 192] - - [1375, 5633.8] + - [1378, 5633.8] - - [3136, 512, 1, 1024] - - [1371, 7553.14] + - [1374, 7553.14] - - [6272, 16, 1, 480] - - [1406, 3219.85] + - [1409, 3219.85] - - [1225, 64, 32, 288] - - [1366, 8240.58] + - [1369, 8240.58] - - [64, 256, 1, 1536] - - [1399, 1456.36] + - [1402, 1456.36] - - [289, 192, 32, 768] - - [1378, 7372.8] + - [1381, 7372.8] - - [2048, 448, 1, 1280] - - [1361, 8403.01] + - [1364, 8403.01] - - [3136, 2048, 1, 512] - - [1360, 9486.31] + - [1363, 9486.31] - - [289, 256, 1, 2016] - - [1406, 3876.08] + - [1409, 3876.08] - - [289, 384, 32, 1024] - - [1345, 7350.54] + - [1348, 7350.54] - - [1568, 32, 1, 832] - - [1395, 2717.87] + - [1398, 2717.87] - - [3136, 64, 32, 64] - - [1348, 7657.26] + - [1351, 7657.26] - - [289, 160, 1, 1120] - - [1402, 2826.9] + - [1405, 2826.9] - - [6272, 128, 1, 528] - - [1356, 6926.26] + - [1359, 6926.26] - - [21609, 32, 1, 288] - - [1357, 3698.9] + - [1360, 3698.9] - - [1225, 192, 1, 1728] - - [1390, 7309.81] + - [1393, 7309.81] - - [4096, 512, 1, 4096] - - [1373, 10272.1] + - [1376, 10272.1] - - [64, 256, 1, 1152] - - [1399, 1387.82] + - [1402, 1387.82] - - [6272, 96, 1, 480] - - [1387, 6371.56] + - [1390, 6371.56] - - [784, 96, 1, 800] - - [1407, 3330.27] + - [1410, 3330.27] - - [2048, 448, 1, 2048] - - [1361, 8622.65] + - [1364, 8622.65] - - [784, 96, 32, 192] - - [1376, 7092.36] + - [1379, 7092.36] - - [289, 224, 1, 1344] - - [1406, 3180.01] + - [1409, 3180.01] - - [1001, 512, 1, 4096] - - [1347, 8195.07] + - [1350, 8195.07] - - [2048, 192, 1, 1280] - - [1352, 6120.09] + - [1355, 6120.09] - - [1225, 64, 32, 256] - - [1357, 8076.62] + - [1360, 8076.62] - - [2048, 256, 1, 1536] - - [1347, 8137.7] + - [1350, 8137.7] - - [1225, 64, 1, 1200] - - [1406, 3552.87] + - [1409, 3552.87] - - [6272, 128, 1, 512] - - [1360, 6878.21] + - [1363, 6878.21] - - [729, 192, 1, 1600] - - [1405, 5016.77] + - [1408, 5016.77] - - [289, 192, 1, 896] - - [1403, 3091.87] + - [1406, 3091.87] - - [1568, 384, 1, 832] - - [1386, 6934.62] + - [1389, 6934.62] - - [784, 16, 32, 192] - - [1377, 3380.28] + - [1380, 3380.28] - - [1568, 256, 1, 832] - - [1351, 5980.86] + - [1354, 5980.86] - - [1568, 48, 1, 832] - - [1408, 3275.09] + - [1411, 3275.09] - - [1568, 192, 1, 832] - - [1346, 4441.11] + - [1349, 4441.11] - - [289, 192, 32, 1024] - - [1349, 6563.06] + - [1352, 6563.06] - - [6272, 32, 1, 528] - - [1390, 4998.67] + - [1393, 4998.67] - - [49, 128, 1, 1200] - - [1391, 550.175] + - [1394, 550.175] - - [1225, 64, 32, 384] - - [1363, 8589.33] + - [1366, 8589.33] - - [289, 128, 1, 896] - - [1402, 2103.1] + - [1405, 2103.1] - - [1568, 160, 1, 832] - - [1390, 6995.05] + - [1393, 6995.05] - - [1001, 32, 1, 1024] - - [1399, 1744.72] + - [1402, 1744.72] - - [2048, 320, 1, 2048] - - [1384, 7118.04] + - [1387, 7118.04] - - [2048, 384, 1, 1536] - - [1347, 8184.01] + - [1350, 8184.01] - - [50176, 512, 1, 256] - - [1359, 9852.4] + - [1362, 9852.4] - - [289, 256, 1, 1792] - - [1408, 3809.75] + - [1411, 3809.75] - - [64, 448, 1, 1152] - - [1400, 2128.23] + - [1403, 2128.23] - - [5041, 96, 1, 576] - - [1385, 5279.3] + - [1388, 5279.3] - - [6272, 192, 1, 480] - - [1347, 7479.65] + - [1350, 7479.65] - - [784, 32, 32, 256] - - [1374, 5708.91] + - [1377, 5708.91] - - [1001, 32, 1, 2048] - - [1401, 2141.04] + - [1404, 2141.04] - - [289, 192, 1, 1120] - - [1397, 3277.77] + - [1400, 3277.77] - - [6272, 32, 1, 512] - - [1389, 4978.7] + - [1392, 4978.7] - - [289, 384, 1, 3456] - - [1406, 5904.14] + - [1409, 5904.14] - - [289, 384, 1, 2592] - - [1407, 5707.34] + - [1410, 5707.34] - - [12544, 1024, 1, 512] - - [1368, 10008.3] + - [1371, 10008.3] - - [12544, 256, 1, 512] - - [1386, 8628.08] + - [1389, 8628.08] - - [6272, 24, 1, 512] - - [1390, 3568.07] + - [1393, 3568.07] - - [5041, 192, 1, 720] - - [1361, 8424.42] + - [1364, 8424.42] - - [64, 320, 1, 1728] - - [1394, 1469.66] + - [1397, 1469.66] - - [784, 128, 32, 256] - - [1362, 8104.14] + - [1365, 8104.14] - - [289, 96, 1, 864] - - [1400, 1838.25] + - [1403, 1838.25] - - [1225, 32, 32, 192] - - [1381, 5949.72] + - [1384, 5949.72] - - [1568, 128, 1, 832] - - [1389, 5718.69] + - [1392, 5718.69] - - [289, 128, 32, 768] - - [1347, 7289.25] + - [1350, 7289.25] - - [196, 64, 1, 800] - - [1393, 915.62] + - [1396, 915.62] - - [4096, 512, 1, 9216] - - [1370, 10351.4] + - [1373, 10351.4] - - [12544, 64, 1, 147] - - [1360, 5069.33] + - [1363, 5069.33] - - [784, 32, 1, 400] - - [1391, 1140.36] + - [1394, 1140.36] - - [6272, 160, 1, 512] - - [1351, 6140.08] + - [1354, 6140.08] - - [1225, 48, 32, 288] - - [1357, 5978.61] + - [1360, 5978.61] - - [64, 320, 1, 2880] - - [1398, 1920.0] + - [1401, 1920.0] - - [1225, 64, 32, 192] - - [1351, 7641.01] + - [1354, 7641.01] - - [1001, 32, 1, 1536] - - [1399, 2084.79] + - [1402, 2084.79] - - [784, 64, 32, 256] - - [1343, 6990.51] + - [1346, 6990.51] - - [64, 384, 1, 1152] - - [1400, 1862.6] + - [1403, 1862.6] - - [3136, 512, 1, 2048] - - [1372, 7764.3] + - [1375, 7764.3] - - [6272, 144, 1, 512] - - [1347, 5574.04] + - [1350, 5574.04] - - [1225, 192, 32, 384] - - [1361, 9373.83] + - [1364, 9373.83] - - [64, 192, 1, 1728] - - [1399, 1206.46] + - [1402, 1206.46] - - [8192, 320, 1, 1280] - - [1413, 9875.92] + - [1416, 9875.92] - - [8192, 320, 1, 2048] - - [1416, 9745.7] + - [1419, 9745.7] - - [8192, 384, 1, 1280] - - [1413, 10046.2] + - [1416, 10046.2] - - [8192, 192, 1, 1280] - - [1416, 9950.9] + - [1419, 9950.9] - - [8192, 192, 1, 2048] - - [1412, 9559.67] + - [1415, 9559.67] - - [8192, 384, 1, 2048] - - [1414, 9945.74] + - [1417, 9945.74] - - [8192, 448, 1, 2048] - - [1415, 9908.51] + - [1418, 9908.51] - - [1001, 64, 1, 1536] - - [1409, 3649.94] + - [1412, 3649.94] - - [8192, 448, 1, 1280] - - [1413, 9981.35] + - [1416, 9981.35] - - [1001, 64, 1, 2048] - - [1410, 3580.87] + - [1413, 3580.87] - - [1001, 128, 1, 2048] - - [1411, 5587.87] + - [1414, 5587.87] - - [3200, 1024, 1, 2048] - - [1419, 9131.95] + - [1422, 9131.95] - - [2048, 1024, 1, 256] - - [1418, 8452.0] + - [1421, 8452.0] - - [257, 1024, 1, 4096] - - [1417, 4225.21] + - [1420, 4225.21] - - [3136, 64, 64, 64] - - [1420, 8028.16] + - [1423, 8028.16] - - [1225, 32, 64, 192] - - [1426, 6968.89] + - [1429, 6968.89] - - [3136, 64, 64, 256] - - [1421, 9678.4] + - [1424, 9678.4] - - [3136, 256, 64, 64] - - [1422, 8998.29] + - [1425, 8998.29] - - [1225, 64, 64, 288] - - [1425, 8893.83] + - [1428, 8893.83] - - [289, 128, 64, 768] - - [1423, 8442.75] + - [1426, 8442.75] - - [5329, 80, 64, 64] - - [1427, 6687.37] + - [1430, 6687.37] - - [1225, 64, 64, 192] - - [1424, 8339.5] + - [1427, 8339.5] - - [1225, 64, 64, 256] - - [1428, 8721.52] + - [1431, 8721.52] + - - [65, 6400, 1, 1024] + - [1432, 2839.89] + - - [256, 6400, 1, 4096] + - [1433, 7361.66] + - - [1024, 64, 1, 4096] + - [1434, 3787.18] - null diff --git a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml index b02f82523..c1422a670 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/vega20_Cijk_Alik_Bljk_SB.yaml @@ -96477,6 +96477,539 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 590 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL1_GRVW2_GSU1_PGR0_PLR1_TT8_4_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 591 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 1 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 + LVCB: 8 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 2 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 8 + MacroTile0: 64 + MacroTile1: 32 + MacroTileA: 64 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 592 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_8_2_WGM8 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -96601,7 +97134,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 590 + SolutionIndex: 593 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW02_GSU32_SNLL0_TT04_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -96750,7 +97283,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 591 + SolutionIndex: 594 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x32_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_02_08 SubGroup0: 16 SubGroup1: 2 @@ -96895,7 +97428,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 592 + SolutionIndex: 595 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -97044,7 +97577,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 593 + SolutionIndex: 596 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x64_GRVW04_GSU32_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -97193,7 +97726,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 594 + SolutionIndex: 597 SolutionNameMin: Cijk_Alik_Bljk_SB_MT036x048x16_GRVW01_GSU02_SNLL0_TT03_03_VW01_WG12_16_01 SubGroup0: 12 SubGroup1: 16 @@ -97342,7 +97875,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 595 + SolutionIndex: 598 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x036x16_GRVW01_GSU08_SNLL0_TT06_03_VW01_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -97491,7 +98024,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 596 + SolutionIndex: 599 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW02_GSU08_SNLL0_TT06_04_VW02_WG08_12_02 SubGroup0: 8 SubGroup1: 12 @@ -97640,7 +98173,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 597 + SolutionIndex: 600 SolutionNameMin: Cijk_Alik_Bljk_SB_MT048x048x16_GRVW01_GSU08_SNLL0_TT03_03_VW01_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -97789,7 +98322,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 598 + SolutionIndex: 601 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -97938,7 +98471,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 599 + SolutionIndex: 602 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98087,7 +98620,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 600 + SolutionIndex: 603 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98236,7 +98769,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 601 + SolutionIndex: 604 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98385,7 +98918,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 602 + SolutionIndex: 605 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -98534,7 +99067,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 603 + SolutionIndex: 606 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98683,7 +99216,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 604 + SolutionIndex: 607 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -98832,7 +99365,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 605 + SolutionIndex: 608 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x08_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -98981,7 +99514,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 606 + SolutionIndex: 609 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -99130,7 +99663,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 607 + SolutionIndex: 610 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -99279,7 +99812,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 608 + SolutionIndex: 611 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99428,7 +99961,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 609 + SolutionIndex: 612 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99577,7 +100110,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 610 + SolutionIndex: 613 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -99726,7 +100259,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 611 + SolutionIndex: 614 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -99875,7 +100408,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 612 + SolutionIndex: 615 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU01_SNLL0_TT04_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100024,7 +100557,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 613 + SolutionIndex: 616 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100173,7 +100706,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 614 + SolutionIndex: 617 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG32_04_01 SubGroup0: 32 SubGroup1: 4 @@ -100322,7 +100855,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 615 + SolutionIndex: 618 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_04_01 SubGroup0: 16 SubGroup1: 4 @@ -100471,7 +101004,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 616 + SolutionIndex: 619 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100620,7 +101153,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 617 + SolutionIndex: 620 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_08_01 SubGroup0: 16 SubGroup1: 8 @@ -100769,7 +101302,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 618 + SolutionIndex: 621 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -100918,7 +101451,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 619 + SolutionIndex: 622 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101067,7 +101600,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 620 + SolutionIndex: 623 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x08_GRVW04_GSU08_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101216,7 +101749,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 621 + SolutionIndex: 624 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101365,7 +101898,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 622 + SolutionIndex: 625 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101514,7 +102047,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 623 + SolutionIndex: 626 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -101663,7 +102196,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 624 + SolutionIndex: 627 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101812,7 +102345,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 625 + SolutionIndex: 628 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -101961,7 +102494,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 626 + SolutionIndex: 629 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102110,7 +102643,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 627 + SolutionIndex: 630 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102259,7 +102792,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 628 + SolutionIndex: 631 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -102408,7 +102941,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 629 + SolutionIndex: 632 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102557,7 +103090,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 630 + SolutionIndex: 633 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102706,7 +103239,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 631 + SolutionIndex: 634 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -102855,7 +103388,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 632 + SolutionIndex: 635 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -103004,7 +103537,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 633 + SolutionIndex: 636 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -103153,7 +103686,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 634 + SolutionIndex: 637 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -103302,7 +103835,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 635 + SolutionIndex: 638 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -103451,7 +103984,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 636 + SolutionIndex: 639 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -103600,7 +104133,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 637 + SolutionIndex: 640 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -103749,7 +104282,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 638 + SolutionIndex: 641 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -103898,7 +104431,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 639 + SolutionIndex: 642 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104047,7 +104580,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 640 + SolutionIndex: 643 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104196,7 +104729,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 641 + SolutionIndex: 644 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -104345,7 +104878,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 642 + SolutionIndex: 645 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104494,7 +105027,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 643 + SolutionIndex: 646 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x16_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104643,7 +105176,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 644 + SolutionIndex: 647 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -104792,7 +105325,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 645 + SolutionIndex: 648 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -104941,7 +105474,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 646 + SolutionIndex: 649 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT04_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105090,7 +105623,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 647 + SolutionIndex: 650 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105239,7 +105772,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 648 + SolutionIndex: 651 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105388,7 +105921,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 649 + SolutionIndex: 652 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105537,7 +106070,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 650 + SolutionIndex: 653 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -105686,7 +106219,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 651 + SolutionIndex: 654 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x016x16_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -105835,7 +106368,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 652 + SolutionIndex: 655 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -105984,7 +106517,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 653 + SolutionIndex: 656 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106133,7 +106666,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 654 + SolutionIndex: 657 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106282,7 +106815,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 655 + SolutionIndex: 658 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x24_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106431,7 +106964,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 656 + SolutionIndex: 659 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -106580,7 +107113,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 657 + SolutionIndex: 660 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -106729,7 +107262,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 658 + SolutionIndex: 661 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -106878,7 +107411,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 659 + SolutionIndex: 662 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -107027,7 +107560,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 660 + SolutionIndex: 663 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107176,7 +107709,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 661 + SolutionIndex: 664 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107325,7 +107858,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 662 + SolutionIndex: 665 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU02_SNLL0_TT08_08_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -107474,7 +108007,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 663 + SolutionIndex: 666 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU02_SNLL0_TT04_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -107623,7 +108156,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 664 + SolutionIndex: 667 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG16_04_04 SubGroup0: 16 SubGroup1: 4 @@ -107772,7 +108305,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 665 + SolutionIndex: 668 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU04_SNLL0_TT08_08_VW04_WG08_08_02 SubGroup0: 8 SubGroup1: 8 @@ -107921,7 +108454,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 666 + SolutionIndex: 669 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108070,7 +108603,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 667 + SolutionIndex: 670 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108219,7 +108752,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 668 + SolutionIndex: 671 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT08_04_VW04_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -108368,7 +108901,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 669 + SolutionIndex: 672 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW04_GSU04_SNLL0_TT04_04_VW04_WG16_08_02 SubGroup0: 16 SubGroup1: 8 @@ -108517,7 +109050,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 670 + SolutionIndex: 673 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108666,7 +109199,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 671 + SolutionIndex: 674 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108815,7 +109348,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 672 + SolutionIndex: 675 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL0_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -108964,7 +109497,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 673 + SolutionIndex: 676 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109113,7 +109646,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 674 + SolutionIndex: 677 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109262,7 +109795,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 675 + SolutionIndex: 678 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL0_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109411,7 +109944,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 676 + SolutionIndex: 679 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL0_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109560,7 +110093,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 677 + SolutionIndex: 680 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109709,7 +110242,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 678 + SolutionIndex: 681 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -109858,7 +110391,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 679 + SolutionIndex: 682 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110007,7 +110540,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 680 + SolutionIndex: 683 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x08_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110156,7 +110689,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 681 + SolutionIndex: 684 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110305,7 +110838,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 682 + SolutionIndex: 685 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110454,7 +110987,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 683 + SolutionIndex: 686 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110603,7 +111136,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 684 + SolutionIndex: 687 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110752,7 +111285,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 685 + SolutionIndex: 688 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -110901,7 +111434,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 686 + SolutionIndex: 689 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111050,7 +111583,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 687 + SolutionIndex: 690 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x16_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111199,7 +111732,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 688 + SolutionIndex: 691 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x16_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111348,7 +111881,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 689 + SolutionIndex: 692 SolutionNameMin: Cijk_Alik_Bljk_SB_MT096x128x16_GRVW02_GSU01_SNLL1_TT06_08_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111497,7 +112030,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 690 + SolutionIndex: 693 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_SNLL1_TT08_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111646,7 +112179,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 691 + SolutionIndex: 694 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x096x16_GRVW02_GSU01_SNLL1_TT08_06_VW02_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111795,7 +112328,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 692 + SolutionIndex: 695 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -111944,7 +112477,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 693 + SolutionIndex: 696 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112093,7 +112626,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 694 + SolutionIndex: 697 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x24_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112242,7 +112775,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 695 + SolutionIndex: 698 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112391,7 +112924,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 696 + SolutionIndex: 699 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112540,7 +113073,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 697 + SolutionIndex: 700 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112689,7 +113222,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 698 + SolutionIndex: 701 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x064x32_GRVW04_GSU01_SNLL1_TT04_04_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112838,7 +113371,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 699 + SolutionIndex: 702 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x128x32_GRVW04_GSU01_SNLL1_TT04_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -112987,7 +113520,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 700 + SolutionIndex: 703 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x32_GRVW04_GSU01_SNLL1_TT08_08_VW04_WG16_16_01 SubGroup0: 16 SubGroup1: 16 @@ -113136,7 +113669,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 701 + SolutionIndex: 704 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x04_GRVW02_GSU01_SNLL0_TT02_02_VW02_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -113285,7 +113818,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 702 + SolutionIndex: 705 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x032x02_GRVW04_GSU01_SNLL0_TT04_04_VW04_WG08_08_01 SubGroup0: 8 SubGroup1: 8 @@ -113434,7 +113967,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 703 + SolutionIndex: 706 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU01_SNLL1_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -113583,7 +114116,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 704 + SolutionIndex: 707 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU02_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -113732,7 +114265,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 705 + SolutionIndex: 708 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU04_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -113881,7 +114414,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 706 + SolutionIndex: 709 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU08_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -114030,7 +114563,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 707 + SolutionIndex: 710 SolutionNameMin: Cijk_Alik_Bljk_SB_MT008x008x08_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG04_04_04 SubGroup0: 4 SubGroup1: 4 @@ -114179,7 +114712,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 708 + SolutionIndex: 711 SolutionNameMin: Cijk_Alik_Bljk_SB_MT016x016x16_GRVW02_GSU16_SNLL0_TT02_02_VW02_WG08_08_04 SubGroup0: 8 SubGroup1: 8 @@ -114324,7 +114857,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 709 + SolutionIndex: 712 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_08_02_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -114470,7 +115003,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 710 + SolutionIndex: 713 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x16_GRVW01_GSU04_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -114616,7 +115149,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 711 + SolutionIndex: 714 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -114762,7 +115295,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 712 + SolutionIndex: 715 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x016x32_GRVW01_GSU08_LPA01_LPB01_PGR1_PLR1_TT04_04_USFGRO01_VW01_WG16_04_04_WGM08 SubGroup0: 16 SubGroup1: 4 @@ -114908,7 +115441,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 713 + SolutionIndex: 716 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x32_GRVW01_GSU01_LPA02_LPB02_PGR1_PLR1_TT04_08_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -115054,7 +115587,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 714 + SolutionIndex: 717 SolutionNameMin: Cijk_Alik_Bljk_SB_MT032x008x32_GRVW01_GSU08_LPA02_LPB02_PGR1_PLR1_TT02_02_USFGRO01_VW02_WG16_04_04_WGM01 SubGroup0: 16 SubGroup1: 4 @@ -115200,7 +115733,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 715 + SolutionIndex: 718 SolutionNameMin: Cijk_Alik_Bljk_SB_MT064x032x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_04_USFGRO01_VW04_WG16_08_02_WGM01 SubGroup0: 16 SubGroup1: 8 @@ -115346,7 +115879,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 716 + SolutionIndex: 719 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW01_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO01_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -115492,7 +116025,7 @@ TransposeB: false UseBeta: true UseInitialStrides: false - SolutionIndex: 717 + SolutionIndex: 720 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x064x16_GRVW04_GSU01_LPA04_LPB04_PGR1_PLR1_TT04_08_USFGRO0_VW04_WG32_08_01_WGM01 SubGroup0: 32 SubGroup1: 8 @@ -115649,7 +116182,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 718 + SolutionIndex: 721 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -115811,7 +116344,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 719 + SolutionIndex: 722 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -115973,7 +116506,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 720 + SolutionIndex: 723 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116135,7 +116668,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 721 + SolutionIndex: 724 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116297,7 +116830,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 722 + SolutionIndex: 725 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116459,7 +116992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 723 + SolutionIndex: 726 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -116621,7 +117154,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 724 + SolutionIndex: 727 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -116783,7 +117316,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 725 + SolutionIndex: 728 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -116945,7 +117478,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 726 + SolutionIndex: 729 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -117107,7 +117640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 727 + SolutionIndex: 730 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -117269,7 +117802,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 728 + SolutionIndex: 731 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117431,7 +117964,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 729 + SolutionIndex: 732 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117593,7 +118126,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 730 + SolutionIndex: 733 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -117755,7 +118288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 731 + SolutionIndex: 734 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -117913,7 +118446,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 732 + SolutionIndex: 735 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -118075,7 +118608,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 733 + SolutionIndex: 736 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118237,7 +118770,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 734 + SolutionIndex: 737 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118399,7 +118932,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 735 + SolutionIndex: 738 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -118561,7 +119094,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 736 + SolutionIndex: 739 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_8_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -118723,7 +119256,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 737 + SolutionIndex: 740 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -118885,7 +119418,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 738 + SolutionIndex: 741 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -119047,7 +119580,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 739 + SolutionIndex: 742 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -119205,7 +119738,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 740 + SolutionIndex: 743 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119363,7 +119896,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 741 + SolutionIndex: 744 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU1_LPA2_LPB0_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119525,7 +120058,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 742 + SolutionIndex: 745 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119687,7 +120220,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 743 + SolutionIndex: 746 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -119849,7 +120382,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 744 + SolutionIndex: 747 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA2_LPB0_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -120011,7 +120544,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 745 + SolutionIndex: 748 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB0_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -120173,7 +120706,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 746 + SolutionIndex: 749 SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x64x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -120335,7 +120868,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 747 + SolutionIndex: 750 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120497,7 +121030,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 748 + SolutionIndex: 751 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -120659,7 +121192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 749 + SolutionIndex: 752 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120821,7 +121354,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 750 + SolutionIndex: 753 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -120979,7 +121512,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 751 + SolutionIndex: 754 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -121141,7 +121674,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 752 + SolutionIndex: 755 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -121303,7 +121836,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 753 + SolutionIndex: 756 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121461,7 +121994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 754 + SolutionIndex: 757 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB0_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121623,7 +122156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 755 + SolutionIndex: 758 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB0_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121785,7 +122318,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 756 + SolutionIndex: 759 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB0_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -121947,7 +122480,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 757 + SolutionIndex: 760 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -122109,7 +122642,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 758 + SolutionIndex: 761 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -122271,7 +122804,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 759 + SolutionIndex: 762 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -122433,7 +122966,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 760 + SolutionIndex: 763 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -122595,7 +123128,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 761 + SolutionIndex: 764 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -122757,7 +123290,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 762 + SolutionIndex: 765 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -122919,7 +123452,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 763 + SolutionIndex: 766 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123077,7 +123610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 764 + SolutionIndex: 767 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123239,7 +123772,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 765 + SolutionIndex: 768 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123401,7 +123934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 766 + SolutionIndex: 769 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -123563,7 +124096,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 767 + SolutionIndex: 770 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -123725,7 +124258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 768 + SolutionIndex: 771 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_8_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -123887,7 +124420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 769 + SolutionIndex: 772 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124045,7 +124578,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 770 + SolutionIndex: 773 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA0_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124207,7 +124740,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 771 + SolutionIndex: 774 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124369,7 +124902,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 772 + SolutionIndex: 775 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -124531,7 +125064,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 773 + SolutionIndex: 776 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -124693,7 +125226,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 774 + SolutionIndex: 777 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x256x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -124855,7 +125388,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 775 + SolutionIndex: 778 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125017,7 +125550,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 776 + SolutionIndex: 779 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125179,7 +125712,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 777 + SolutionIndex: 780 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125341,7 +125874,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 778 + SolutionIndex: 781 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125503,7 +126036,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 779 + SolutionIndex: 782 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA0_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -125665,7 +126198,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 780 + SolutionIndex: 783 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA0_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125827,7 +126360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 781 + SolutionIndex: 784 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -125989,7 +126522,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 782 + SolutionIndex: 785 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126151,7 +126684,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 783 + SolutionIndex: 786 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126313,7 +126846,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 784 + SolutionIndex: 787 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126475,7 +127008,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 785 + SolutionIndex: 788 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126637,7 +127170,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 786 + SolutionIndex: 789 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126799,7 +127332,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 787 + SolutionIndex: 790 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -126957,7 +127490,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 788 + SolutionIndex: 791 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS0_GSU5_LPA2_LPB2_PGR0_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127119,7 +127652,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 789 + SolutionIndex: 792 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127281,7 +127814,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 790 + SolutionIndex: 793 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127443,7 +127976,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 791 + SolutionIndex: 794 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -127605,7 +128138,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 792 + SolutionIndex: 795 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -127767,7 +128300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 793 + SolutionIndex: 796 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -127929,7 +128462,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 794 + SolutionIndex: 797 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128091,7 +128624,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 795 + SolutionIndex: 798 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128253,7 +128786,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 796 + SolutionIndex: 799 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128415,7 +128948,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 797 + SolutionIndex: 800 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128577,7 +129110,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 798 + SolutionIndex: 801 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128739,7 +129272,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 799 + SolutionIndex: 802 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -128901,7 +129434,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 800 + SolutionIndex: 803 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129063,7 +129596,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 801 + SolutionIndex: 804 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129225,7 +129758,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 802 + SolutionIndex: 805 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT4_8_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129387,7 +129920,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 803 + SolutionIndex: 806 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA2_LPB2_PGR1_SNLL0_TT8_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129549,7 +130082,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 804 + SolutionIndex: 807 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129711,7 +130244,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 805 + SolutionIndex: 808 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -129873,7 +130406,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 806 + SolutionIndex: 809 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -130035,7 +130568,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 807 + SolutionIndex: 810 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA2_LPB2_PGR1_SNLL1_TT8_4_VW2_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130197,7 +130730,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 808 + SolutionIndex: 811 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU5_LPA2_LPB2_PGR1_SNLL0_TT4_4_VW2_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -130359,7 +130892,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 809 + SolutionIndex: 812 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130521,7 +131054,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 810 + SolutionIndex: 813 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -130683,7 +131216,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 811 + SolutionIndex: 814 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -130845,7 +131378,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 812 + SolutionIndex: 815 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131007,7 +131540,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 813 + SolutionIndex: 816 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131169,7 +131702,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 814 + SolutionIndex: 817 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131331,7 +131864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 815 + SolutionIndex: 818 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131493,7 +132026,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 816 + SolutionIndex: 819 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131651,7 +132184,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 817 + SolutionIndex: 820 SolutionNameMin: Cijk_Alik_Bljk_SB_MT256x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT8_4_VW4_WG32_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131813,7 +132346,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 818 + SolutionIndex: 821 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU3_LPA4_LPB4_PGR1_SNLL0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -131975,7 +132508,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 819 + SolutionIndex: 822 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -132137,7 +132670,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 820 + SolutionIndex: 823 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132299,7 +132832,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 821 + SolutionIndex: 824 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132461,7 +132994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 822 + SolutionIndex: 825 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132623,7 +133156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 823 + SolutionIndex: 826 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132785,7 +133318,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 824 + SolutionIndex: 827 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -132943,7 +133476,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 825 + SolutionIndex: 828 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS0_GSU1_LPA4_LPB4_PGR0_SNLL0_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133105,7 +133638,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 826 + SolutionIndex: 829 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133267,7 +133800,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 827 + SolutionIndex: 830 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133429,7 +133962,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 828 + SolutionIndex: 831 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -133591,7 +134124,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 829 + SolutionIndex: 832 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -133753,7 +134286,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 830 + SolutionIndex: 833 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -133915,7 +134448,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 831 + SolutionIndex: 834 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134077,7 +134610,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 832 + SolutionIndex: 835 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134239,7 +134772,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 833 + SolutionIndex: 836 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134401,7 +134934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 834 + SolutionIndex: 837 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134563,7 +135096,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 835 + SolutionIndex: 838 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT8_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134725,7 +135258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 836 + SolutionIndex: 839 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x32x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -134887,7 +135420,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 837 + SolutionIndex: 840 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG32_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -135049,7 +135582,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 838 + SolutionIndex: 841 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x16_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -135211,7 +135744,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 839 + SolutionIndex: 842 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135373,7 +135906,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 840 + SolutionIndex: 843 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135535,7 +136068,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 841 + SolutionIndex: 844 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x128x32_EPS1_GSU5_LPA4_LPB4_PGR1_SNLL0_TT4_4_VW4_WG8_32_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -135697,7 +136230,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 842 + SolutionIndex: 845 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -135859,7 +136392,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 843 + SolutionIndex: 846 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -136021,7 +136554,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 844 + SolutionIndex: 847 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -136183,7 +136716,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 845 + SolutionIndex: 848 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x32_EPS1_GSU1_LPA4_LPB4_PGR1_SNLL1_TT4_8_VW4_WG16_16_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -136347,7 +136880,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 846 + SolutionIndex: 849 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136511,7 +137044,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 847 + SolutionIndex: 850 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136675,7 +137208,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 848 + SolutionIndex: 851 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -136839,7 +137372,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 849 + SolutionIndex: 852 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137003,7 +137536,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 850 + SolutionIndex: 853 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137167,7 +137700,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 851 + SolutionIndex: 854 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -137331,7 +137864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 852 + SolutionIndex: 855 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -137495,7 +138028,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 853 + SolutionIndex: 856 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW2_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137659,7 +138192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 854 + SolutionIndex: 857 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x8x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_2_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137823,7 +138356,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 855 + SolutionIndex: 858 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT8_4_VW2_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -137987,7 +138520,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 856 + SolutionIndex: 859 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138151,7 +138684,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 857 + SolutionIndex: 860 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138315,7 +138848,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 858 + SolutionIndex: 861 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138479,7 +139012,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 859 + SolutionIndex: 862 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138639,7 +139172,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 860 + SolutionIndex: 863 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_GSU1_PGR0_SNLL0_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138803,7 +139336,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 861 + SolutionIndex: 864 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -138967,7 +139500,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 862 + SolutionIndex: 865 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_2_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139131,7 +139664,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 863 + SolutionIndex: 866 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139295,7 +139828,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 864 + SolutionIndex: 867 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU8_PGR1_SNLL0_TT4_4_VW4_WG4_4_8_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -139459,7 +139992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 865 + SolutionIndex: 868 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG16_16_1_WGM16 StaggerU: 32 StaggerUMapping: 0 @@ -139623,7 +140156,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 866 + SolutionIndex: 869 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG8_8_1_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -139787,7 +140320,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 867 + SolutionIndex: 870 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG2_8_4_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -139951,7 +140484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 868 + SolutionIndex: 871 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_GSU1_PGR1_SNLL1_TT8_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -140115,7 +140648,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 869 + SolutionIndex: 872 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x16x32_SE_EPS1_GSU1_PGR1_SNLL1_TT4_4_VW4_WG4_4_8_WGM64 StaggerU: 32 StaggerUMapping: 0 @@ -140280,7 +140813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 870 + SolutionIndex: 873 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -140447,7 +140980,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 871 + SolutionIndex: 874 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140612,7 +141145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 872 + SolutionIndex: 875 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140775,7 +141308,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 873 + SolutionIndex: 876 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -140940,7 +141473,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 874 + SolutionIndex: 877 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -141107,7 +141640,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 875 + SolutionIndex: 878 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141272,7 +141805,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 876 + SolutionIndex: 879 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141435,7 +141968,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 877 + SolutionIndex: 880 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141600,7 +142133,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 878 + SolutionIndex: 881 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -141767,7 +142300,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 879 + SolutionIndex: 882 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -141932,7 +142465,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 880 + SolutionIndex: 883 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142095,7 +142628,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 881 + SolutionIndex: 884 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO0_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142260,7 +142793,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 882 + SolutionIndex: 885 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_SNLL1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -142427,7 +142960,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 883 + SolutionIndex: 886 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142592,7 +143125,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 884 + SolutionIndex: 887 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142755,7 +143288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 885 + SolutionIndex: 888 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -142920,7 +143453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 886 + SolutionIndex: 889 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA0_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143085,7 +143618,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 887 + SolutionIndex: 890 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA0_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143252,7 +143785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 888 + SolutionIndex: 891 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143417,7 +143950,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 889 + SolutionIndex: 892 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143582,7 +144115,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 890 + SolutionIndex: 893 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143747,7 +144280,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 891 + SolutionIndex: 894 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -143912,7 +144445,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 892 + SolutionIndex: 895 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144075,7 +144608,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 893 + SolutionIndex: 896 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144240,7 +144773,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 894 + SolutionIndex: 897 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144407,7 +144940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 895 + SolutionIndex: 898 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144572,7 +145105,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 896 + SolutionIndex: 899 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144735,7 +145268,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 897 + SolutionIndex: 900 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -144900,7 +145433,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 898 + SolutionIndex: 901 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145067,7 +145600,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 899 + SolutionIndex: 902 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145232,7 +145765,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 900 + SolutionIndex: 903 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145395,7 +145928,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 901 + SolutionIndex: 904 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -145562,7 +146095,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 902 + SolutionIndex: 905 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -145725,7 +146258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 903 + SolutionIndex: 906 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_SNLL1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -145892,7 +146425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 904 + SolutionIndex: 907 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO0_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146055,7 +146588,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 905 + SolutionIndex: 908 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_SNLL1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146216,7 +146749,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 906 + SolutionIndex: 909 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB0_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146379,7 +146912,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 907 + SolutionIndex: 910 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146540,7 +147073,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 908 + SolutionIndex: 911 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW1_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -146701,7 +147234,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 909 + SolutionIndex: 912 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -146860,7 +147393,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 910 + SolutionIndex: 913 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147023,7 +147556,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 911 + SolutionIndex: 914 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147182,7 +147715,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 912 + SolutionIndex: 915 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147345,7 +147878,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 913 + SolutionIndex: 916 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147504,7 +148037,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 914 + SolutionIndex: 917 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -147667,7 +148200,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 915 + SolutionIndex: 918 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147826,7 +148359,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 916 + SolutionIndex: 919 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW1_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -147989,7 +148522,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 917 + SolutionIndex: 920 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148148,7 +148681,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 918 + SolutionIndex: 921 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL1_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148311,7 +148844,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 919 + SolutionIndex: 922 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT4_2_USFGRO1_VW1_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148472,7 +149005,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 920 + SolutionIndex: 923 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA1_LPB1_PGR0_PLR1_SNLL0_TT2_4_USFGRO1_VW1_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -148631,7 +149164,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 921 + SolutionIndex: 924 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR0_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -148794,7 +149327,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 922 + SolutionIndex: 925 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -148953,7 +149486,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 923 + SolutionIndex: 926 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -149114,7 +149647,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 924 + SolutionIndex: 927 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -149275,7 +149808,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 925 + SolutionIndex: 928 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_SNLL0_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149442,7 +149975,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 926 + SolutionIndex: 929 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT6_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149611,7 +150144,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 927 + SolutionIndex: 930 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149778,7 +150311,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 928 + SolutionIndex: 931 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -149943,7 +150476,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 929 + SolutionIndex: 932 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150110,7 +150643,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 930 + SolutionIndex: 933 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150280,7 +150813,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 931 + SolutionIndex: 934 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150448,7 +150981,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 932 + SolutionIndex: 935 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150612,7 +151145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 933 + SolutionIndex: 936 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR0_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -150776,7 +151309,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 934 + SolutionIndex: 937 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -150942,7 +151475,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 935 + SolutionIndex: 938 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151106,7 +151639,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 936 + SolutionIndex: 939 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -151274,7 +151807,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 937 + SolutionIndex: 940 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151444,7 +151977,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 938 + SolutionIndex: 941 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW1_LPA0_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151612,7 +152145,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 939 + SolutionIndex: 942 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151780,7 +152313,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 940 + SolutionIndex: 943 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -151948,7 +152481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 941 + SolutionIndex: 944 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152114,7 +152647,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 942 + SolutionIndex: 945 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x16_SE_EPS1_FL1_GRVW2_LPA0_LPB0_PGR1_PLR1_TT2_2_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152284,7 +152817,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 943 + SolutionIndex: 946 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152450,7 +152983,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 944 + SolutionIndex: 947 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT6_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -152620,7 +153153,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 945 + SolutionIndex: 948 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152788,7 +153321,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 946 + SolutionIndex: 949 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -152956,7 +153489,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 947 + SolutionIndex: 950 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153124,7 +153657,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 948 + SolutionIndex: 951 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153290,7 +153823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 949 + SolutionIndex: 952 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_LPA0_LPB0_PGR1_PLR1_TT8_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153460,7 +153993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 950 + SolutionIndex: 953 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -153626,7 +154159,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 951 + SolutionIndex: 954 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153794,7 +154327,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 952 + SolutionIndex: 955 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -153967,7 +154500,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 953 + SolutionIndex: 956 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154138,7 +154671,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 954 + SolutionIndex: 957 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154309,7 +154842,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 955 + SolutionIndex: 958 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154480,7 +155013,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 956 + SolutionIndex: 959 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154649,7 +155182,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 957 + SolutionIndex: 960 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -154820,7 +155353,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 958 + SolutionIndex: 961 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -154991,7 +155524,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 959 + SolutionIndex: 962 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155158,7 +155691,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 960 + SolutionIndex: 963 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_PGR0_PLR1_TT4_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155331,7 +155864,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 961 + SolutionIndex: 964 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155502,7 +156035,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 962 + SolutionIndex: 965 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x96x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT4_6_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -155673,7 +156206,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 963 + SolutionIndex: 966 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -155844,7 +156377,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 964 + SolutionIndex: 967 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156015,7 +156548,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 965 + SolutionIndex: 968 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156186,7 +156719,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 966 + SolutionIndex: 969 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156355,7 +156888,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 967 + SolutionIndex: 970 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -156526,7 +157059,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 968 + SolutionIndex: 971 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156697,7 +157230,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 969 + SolutionIndex: 972 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -156868,7 +157401,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 970 + SolutionIndex: 973 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157039,7 +157572,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 971 + SolutionIndex: 974 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157212,7 +157745,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 972 + SolutionIndex: 975 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -157383,7 +157916,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 973 + SolutionIndex: 976 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157554,7 +158087,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 974 + SolutionIndex: 977 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157725,7 +158258,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 975 + SolutionIndex: 978 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -157896,7 +158429,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 976 + SolutionIndex: 979 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158065,7 +158598,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 977 + SolutionIndex: 980 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158236,7 +158769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 978 + SolutionIndex: 981 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158407,7 +158940,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 979 + SolutionIndex: 982 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158578,7 +159111,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 980 + SolutionIndex: 983 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -158749,7 +159282,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 981 + SolutionIndex: 984 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -158920,7 +159453,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 982 + SolutionIndex: 985 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_4_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159091,7 +159624,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 983 + SolutionIndex: 986 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_4_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159264,7 +159797,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 984 + SolutionIndex: 987 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159435,7 +159968,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 985 + SolutionIndex: 988 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -159606,7 +160139,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 986 + SolutionIndex: 989 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159777,7 +160310,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 987 + SolutionIndex: 990 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -159948,7 +160481,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 988 + SolutionIndex: 991 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160119,7 +160652,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 989 + SolutionIndex: 992 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160290,7 +160823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 990 + SolutionIndex: 993 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160459,7 +160992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 991 + SolutionIndex: 994 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160630,7 +161163,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 992 + SolutionIndex: 995 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -160801,7 +161334,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 993 + SolutionIndex: 996 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -160972,7 +161505,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 994 + SolutionIndex: 997 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161143,7 +161676,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 995 + SolutionIndex: 998 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161314,7 +161847,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 996 + SolutionIndex: 999 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161485,7 +162018,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 997 + SolutionIndex: 1000 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR0_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -161656,7 +162189,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 998 + SolutionIndex: 1001 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -161827,7 +162360,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 999 + SolutionIndex: 1002 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT8_8_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162000,7 +162533,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1000 + SolutionIndex: 1003 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162171,7 +162704,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1001 + SolutionIndex: 1004 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162342,7 +162875,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1002 + SolutionIndex: 1005 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW2_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162514,7 +163047,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1003 + SolutionIndex: 1006 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -162686,7 +163219,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1004 + SolutionIndex: 1007 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -162858,7 +163391,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1005 + SolutionIndex: 1008 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163030,7 +163563,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1006 + SolutionIndex: 1009 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163202,7 +163735,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1007 + SolutionIndex: 1010 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163370,7 +163903,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1008 + SolutionIndex: 1011 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163542,7 +164075,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1009 + SolutionIndex: 1012 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -163714,7 +164247,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1010 + SolutionIndex: 1013 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -163884,7 +164417,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1011 + SolutionIndex: 1014 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164056,7 +164589,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1012 + SolutionIndex: 1015 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164224,7 +164757,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1013 + SolutionIndex: 1016 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS0_FL1_GRVW2_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164396,7 +164929,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1014 + SolutionIndex: 1017 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164568,7 +165101,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1015 + SolutionIndex: 1018 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -164740,7 +165273,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1016 + SolutionIndex: 1019 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -164908,7 +165441,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1017 + SolutionIndex: 1020 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -165080,7 +165613,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1018 + SolutionIndex: 1021 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165252,7 +165785,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1019 + SolutionIndex: 1022 SolutionNameMin: Cijk_Alik_Bljk_SB_MT96x64x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT6_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -165426,7 +165959,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1020 + SolutionIndex: 1023 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165598,7 +166131,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1021 + SolutionIndex: 1024 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165770,7 +166303,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1022 + SolutionIndex: 1025 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -165942,7 +166475,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1023 + SolutionIndex: 1026 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166112,7 +166645,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1024 + SolutionIndex: 1027 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166286,7 +166819,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1025 + SolutionIndex: 1028 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166458,7 +166991,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1026 + SolutionIndex: 1029 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166628,7 +167161,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1027 + SolutionIndex: 1030 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -166796,7 +167329,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1028 + SolutionIndex: 1031 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS0_FL1_GRVW4_GSU1_LPA0_LPB0_PGR0_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -166968,7 +167501,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1029 + SolutionIndex: 1032 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -167140,7 +167673,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1030 + SolutionIndex: 1033 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR0_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167312,7 +167845,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1031 + SolutionIndex: 1034 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT8_8_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167484,7 +168017,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1032 + SolutionIndex: 1035 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167658,7 +168191,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1033 + SolutionIndex: 1036 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -167830,7 +168363,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1034 + SolutionIndex: 1037 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168002,7 +168535,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1035 + SolutionIndex: 1038 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168174,7 +168707,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1036 + SolutionIndex: 1039 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168344,7 +168877,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1037 + SolutionIndex: 1040 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168516,7 +169049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1038 + SolutionIndex: 1041 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG8_16_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -168690,7 +169223,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1039 + SolutionIndex: 1042 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SE_EPS1_FL0_GRVW1_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -168862,7 +169395,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1040 + SolutionIndex: 1043 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169034,7 +169567,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1041 + SolutionIndex: 1044 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169206,7 +169739,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1042 + SolutionIndex: 1045 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169378,7 +169911,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1043 + SolutionIndex: 1046 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169548,7 +170081,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1044 + SolutionIndex: 1047 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -169720,7 +170253,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1045 + SolutionIndex: 1048 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x8_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -169892,7 +170425,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1046 + SolutionIndex: 1049 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170064,7 +170597,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1047 + SolutionIndex: 1050 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_8_2_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170236,7 +170769,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1048 + SolutionIndex: 1051 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170408,7 +170941,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1049 + SolutionIndex: 1052 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170580,7 +171113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1050 + SolutionIndex: 1053 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -170754,7 +171287,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1051 + SolutionIndex: 1054 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -170926,7 +171459,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1052 + SolutionIndex: 1055 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171096,7 +171629,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1053 + SolutionIndex: 1056 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171270,7 +171803,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1054 + SolutionIndex: 1057 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171442,7 +171975,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1055 + SolutionIndex: 1058 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171614,7 +172147,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1056 + SolutionIndex: 1059 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -171786,7 +172319,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1057 + SolutionIndex: 1060 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -171958,7 +172491,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1058 + SolutionIndex: 1061 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172130,7 +172663,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1059 + SolutionIndex: 1062 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172302,7 +172835,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1060 + SolutionIndex: 1063 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -172474,7 +173007,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1061 + SolutionIndex: 1064 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172644,7 +173177,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1062 + SolutionIndex: 1065 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x16_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -172818,7 +173351,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1063 + SolutionIndex: 1066 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -172990,7 +173523,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1064 + SolutionIndex: 1067 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173162,7 +173695,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1065 + SolutionIndex: 1068 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU8_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173334,7 +173867,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1066 + SolutionIndex: 1069 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -173506,7 +174039,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1067 + SolutionIndex: 1070 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x16_SE_EPS1_FL0_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173676,7 +174209,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1068 + SolutionIndex: 1071 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x8_SE_EPS1_FL1_GRVW4_GSU1_LPA0_LPB0_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -173850,7 +174383,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1069 + SolutionIndex: 1072 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174020,7 +174553,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1070 + SolutionIndex: 1073 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x8_SE_EPS1_FL1_GRVW2_GSU1_LPA4_LPB4_PGR1_PLR0_TT4_4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174192,7 +174725,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1071 + SolutionIndex: 1074 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174366,7 +174899,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1072 + SolutionIndex: 1075 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174536,7 +175069,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1073 + SolutionIndex: 1076 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -174710,7 +175243,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1074 + SolutionIndex: 1077 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -174882,7 +175415,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1075 + SolutionIndex: 1078 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW4_GSU1_LPA4_LPB0_PGR1_PLR1_TT4_8_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175054,7 +175587,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1076 + SolutionIndex: 1079 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT8_4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175226,7 +175759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1077 + SolutionIndex: 1080 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175398,7 +175931,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1078 + SolutionIndex: 1081 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA4_LPB4_PGR1_PLR1_TT4_4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -175570,7 +176103,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1079 + SolutionIndex: 1082 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175742,7 +176275,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1080 + SolutionIndex: 1083 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -175914,7 +176447,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1081 + SolutionIndex: 1084 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176086,7 +176619,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1082 + SolutionIndex: 1085 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176258,7 +176791,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1083 + SolutionIndex: 1086 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176430,7 +176963,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1084 + SolutionIndex: 1087 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176602,7 +177135,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1085 + SolutionIndex: 1088 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -176774,7 +177307,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1086 + SolutionIndex: 1089 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x32x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR0_TT4_2_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -176946,7 +177479,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1087 + SolutionIndex: 1090 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW1_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177118,7 +177651,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1088 + SolutionIndex: 1091 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB0_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177290,7 +177823,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1089 + SolutionIndex: 1092 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW2_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177462,7 +177995,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1090 + SolutionIndex: 1093 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177634,7 +178167,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1091 + SolutionIndex: 1094 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x8x32_SE_EPS1_FL0_GRVW4_GSU1_LPA2_LPB2_PGR1_PLR1_TT2_2_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -177806,7 +178339,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1092 + SolutionIndex: 1095 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL0_GRVW1_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -177976,7 +178509,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1093 + SolutionIndex: 1096 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x16x32_SE_EPS1_FL1_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT2_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178150,7 +178683,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1094 + SolutionIndex: 1097 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW2_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178322,7 +178855,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1095 + SolutionIndex: 1098 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL0_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178492,7 +179025,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1096 + SolutionIndex: 1099 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x16x32_SE_EPS1_FL1_GRVW4_GSU8_LPA2_LPB2_PGR1_PLR1_TT4_4_WG16_4_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -178666,7 +179199,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1097 + SolutionIndex: 1100 SolutionNameMin: Cijk_Alik_Bljk_SB_MT8x32x32_SE_EPS1_FL0_GRVW4_GSU4_LPA0_LPB2_PGR1_PLR1_TT2_2_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -178840,7 +179373,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1098 + SolutionIndex: 1101 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179014,7 +179547,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1099 + SolutionIndex: 1102 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -179188,7 +179721,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1100 + SolutionIndex: 1103 SolutionNameMin: Cijk_Alik_Bljk_SB_MT16x64x16_SE_EPS1_FL0_GRVW4_LPA0_LPB0_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG4_16_4_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179360,7 +179893,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1101 + SolutionIndex: 1104 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -179534,7 +180067,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1102 + SolutionIndex: 1105 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179710,7 +180243,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1103 + SolutionIndex: 1106 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -179880,7 +180413,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1104 + SolutionIndex: 1107 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180052,7 +180585,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1105 + SolutionIndex: 1108 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180226,7 +180759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1106 + SolutionIndex: 1109 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180402,7 +180935,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1107 + SolutionIndex: 1110 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180576,7 +181109,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1108 + SolutionIndex: 1111 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -180753,7 +181286,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1109 + SolutionIndex: 1112 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -180930,7 +181463,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1110 + SolutionIndex: 1113 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181105,7 +181638,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1111 + SolutionIndex: 1114 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181282,7 +181815,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1112 + SolutionIndex: 1115 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -181459,7 +181992,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1113 + SolutionIndex: 1116 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181638,7 +182171,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1114 + SolutionIndex: 1117 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181813,7 +182346,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1115 + SolutionIndex: 1118 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -181992,7 +182525,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1116 + SolutionIndex: 1119 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182167,7 +182700,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1117 + SolutionIndex: 1120 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -182344,7 +182877,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1118 + SolutionIndex: 1121 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182521,7 +183054,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1119 + SolutionIndex: 1122 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -182700,7 +183233,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1120 + SolutionIndex: 1123 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -182875,7 +183408,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1121 + SolutionIndex: 1124 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR0_TT4_8_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183052,7 +183585,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1122 + SolutionIndex: 1125 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183229,7 +183762,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1123 + SolutionIndex: 1126 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_8_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183404,7 +183937,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1124 + SolutionIndex: 1127 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW1_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183581,7 +184114,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1125 + SolutionIndex: 1128 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -183754,7 +184287,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1126 + SolutionIndex: 1129 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL0_GRVW4_LPA4_LPB4_PGR0_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -183929,7 +184462,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1127 + SolutionIndex: 1130 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x8_SE_EPS1_FL1_GRVW2_LPA4_LPB0_PGR1_PLR0_TT8_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184106,7 +184639,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1128 + SolutionIndex: 1131 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184283,7 +184816,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1129 + SolutionIndex: 1132 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT8_4_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184460,7 +184993,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1130 + SolutionIndex: 1133 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184639,7 +185172,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1131 + SolutionIndex: 1134 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -184812,7 +185345,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1132 + SolutionIndex: 1135 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR0_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -184985,7 +185518,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1133 + SolutionIndex: 1136 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185162,7 +185695,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1134 + SolutionIndex: 1137 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185337,7 +185870,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1135 + SolutionIndex: 1138 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185516,7 +186049,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1136 + SolutionIndex: 1139 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -185691,7 +186224,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1137 + SolutionIndex: 1140 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL1_GRVW2_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -185870,7 +186403,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1138 + SolutionIndex: 1141 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186045,7 +186578,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1139 + SolutionIndex: 1142 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186226,7 +186759,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1140 + SolutionIndex: 1143 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -186401,7 +186934,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1141 + SolutionIndex: 1144 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186580,7 +187113,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1142 + SolutionIndex: 1145 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -186755,7 +187288,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1143 + SolutionIndex: 1146 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL0_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -186934,7 +187467,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1144 + SolutionIndex: 1147 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187109,7 +187642,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1145 + SolutionIndex: 1148 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187282,7 +187815,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1146 + SolutionIndex: 1149 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187461,7 +187994,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1147 + SolutionIndex: 1150 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -187636,7 +188169,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1148 + SolutionIndex: 1151 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x64x16_SE_EPS0_FL1_GRVW2_LPA4_LPB4_PGR0_PLR1_TT4_4_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187817,7 +188350,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1149 + SolutionIndex: 1152 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -187996,7 +188529,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1150 + SolutionIndex: 1153 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x8_SE_EPS1_FL0_GRVW2_LPA0_LPB0_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -188173,7 +188706,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1151 + SolutionIndex: 1154 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -188352,7 +188885,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1152 + SolutionIndex: 1155 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x128x16_SE_EPS1_FL1_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_8_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188531,7 +189064,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1153 + SolutionIndex: 1156 SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x32_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT8_4_USFGRO1_VW4_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188712,7 +189245,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1154 + SolutionIndex: 1157 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -188891,7 +189424,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1155 + SolutionIndex: 1158 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -189066,7 +189599,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1156 + SolutionIndex: 1159 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189245,7 +189778,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1157 + SolutionIndex: 1160 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189424,7 +189957,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1158 + SolutionIndex: 1161 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -189599,7 +190132,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1159 + SolutionIndex: 1162 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189778,7 +190311,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1160 + SolutionIndex: 1163 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL0_GRVW4_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -189951,7 +190484,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1161 + SolutionIndex: 1164 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190126,7 +190659,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1162 + SolutionIndex: 1165 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -190305,7 +190838,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1163 + SolutionIndex: 1166 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190484,7 +191017,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1164 + SolutionIndex: 1167 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -190659,7 +191192,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1165 + SolutionIndex: 1168 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 @@ -190834,7 +191367,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1166 + SolutionIndex: 1169 SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x32_SE_EPS0_FL1_GRVW4_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191015,7 +191548,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1167 + SolutionIndex: 1170 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191190,7 +191723,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1168 + SolutionIndex: 1171 SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM8 StaggerU: 32 StaggerUMapping: 0 @@ -191313,7 +191846,186 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1172 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 4] + ThreadTile0: 2 + ThreadTile1: 4 + ThreadTileA: 2 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 8, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -191365,8 +192077,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1169 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW4_LPA2_LPB2_PGR0_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1173 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191409,7 +192121,7 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 GlobalLoadVectorWidthB: 4 GlobalRead2A: true @@ -191472,8 +192184,6 @@ NonTemporalC: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 @@ -191544,8 +192254,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1170 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1174 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191569,6 +192279,8 @@ WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 1 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191580,7 +192292,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -191588,16 +192300,16 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 1 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191608,22 +192320,22 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 - LVCB: 8 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 + LdsNumElements: 3680 + LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 LdsPadA: 2 LdsPadB: 2 LocalDotLayout: 1 @@ -191636,11 +192348,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 + LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 32 + MacroTile1: 64 MacroTileA: 32 - MacroTileB: 32 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -191649,12 +192361,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -191721,20 +192435,20 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1171 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL1_GRVW4_LPA2_LPB2_PGR1_PLR1_TT2_4_USFGRO1_VW2_WG16_8_1_WGM1 + SolutionIndex: 1175 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 16 - SubGroup1: 8 - SubGroupA: 16 - SubGroupB: 8 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [2, 4] - ThreadTile0: 2 + ThreadTile: [4, 4] + ThreadTile0: 4 ThreadTile1: 4 - ThreadTileA: 2 + ThreadTileA: 4 ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 @@ -191742,12 +192456,189 @@ VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [16, 8, 1] + WorkGroup: [8, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsNumElements: 3712 + LdsNumElementsAlignedA: 576 + LdsNumElementsAlignedB: 1088 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 576 + LdsOffsetB_Blk: 2624 + LdsPadA: 4 + LdsPadB: 4 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1176 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 - fractionalPerpOverhangA: 0 - fractionalPerpOverhangB: 0 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191780,7 +192671,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -191796,15 +192687,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3680 + LdsNumElements: 3712 LdsNumElementsAlignedA: 576 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 576 LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 2 + LdsPadA: 4 + LdsPadB: 4 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -191829,7 +192720,7 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 NumLoadsB: 8 NumLoadsCoalescedA: 1 @@ -191902,8 +192793,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1172 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM8 + SolutionIndex: 1177 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -191922,11 +192813,186 @@ Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 + VectorWidth: 4 WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 6] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: false + LdsNumElements: 3200 + LdsOffsetA: 0 + LdsOffsetB: 1088 + LdsPadA: 2 + LdsPadB: 2 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 64 + MacroTileA: 32 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1178 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 16 + SubGroupA: 8 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 4] + ThreadTile0: 4 + ThreadTile1: 4 + ThreadTileA: 4 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [8, 16, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -191947,15 +193013,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -191968,10 +193034,10 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 LVPA: 8 LVPB: 8 LdcEqualsLdd: false @@ -192009,12 +193075,12 @@ NonTemporalC: 0 NumElementsPerThread: 16 NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192081,8 +193147,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1173 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 1179 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192138,7 +193204,7 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] @@ -192154,15 +193220,15 @@ LVPA: 8 LVPB: 8 LdcEqualsLdd: false - LdsNumElements: 3712 + LdsNumElements: 3168 LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 1088 + LdsNumElementsAlignedB: 576 LdsOffsetA: 0 LdsOffsetA_Blk: 2048 LdsOffsetB: 576 LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192175,9 +193241,9 @@ LoopTail: true LoopUnroll: 16 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192186,14 +193252,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 4 - NumLoadsB: 8 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 + NumLoadsPerpendicularB: 4 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192260,8 +193326,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1174 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW1_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM8 + SolutionIndex: 1180 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192270,17 +193336,17 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B @@ -192305,15 +193371,15 @@ EdgeType: ShiftPtr ExpandPointerSwap: false FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192326,14 +193392,14 @@ KernelLanguage: Assembly LSCA: 32 LSCB: 32 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 LVPA: 4 LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3200 + LdsNumElements: 2176 LdsOffsetA: 0 LdsOffsetB: 1088 LdsPadA: 2 @@ -192350,9 +193416,9 @@ LoopTail: true LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192361,13 +193427,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 8 NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 8 NumThreads: 128 OptNoLoadLoop: 1 @@ -192435,8 +193501,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1175 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x32_SE_EPS0_FL0_GRVW2_LPA2_LPB2_PGR0_PLR1_TT4_4_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1181 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192445,11 +193511,11 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true @@ -192471,7 +193537,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 16 + DepthU: 32 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -192480,43 +193546,43 @@ EdgeType: ShiftPtr ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 2 - GlobalLoadVectorWidthB: 2 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 2 + GlobalReadVectorWidth: 4 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 2 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 16 - LSCB: 16 + LSCA: 32 + LSCB: 32 LSPA: 16 LSPB: 16 LVCA: 8 LVCB: 8 - LVPA: 8 - LVPB: 8 + LVPA: 4 + LVPB: 4 LdcEqualsLdd: false - LdsNumElements: 3712 - LdsNumElementsAlignedA: 576 + LdsNumElements: 6272 + LdsNumElementsAlignedA: 1088 LdsNumElementsAlignedB: 1088 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 4 - LdsPadB: 4 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1088 + LdsOffsetB_Blk: 5184 + LdsPadA: 2 + LdsPadB: 2 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192527,11 +193593,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 16 + LoopUnroll: 32 MacroTile0: 32 - MacroTile1: 64 + MacroTile1: 32 MacroTileA: 32 - MacroTileB: 64 + MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192540,14 +193606,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 16 + NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 4 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 128 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -192614,8 +193680,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1176 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x64x16_SE_EPS1_FL0_GRVW2_LPA4_LPB4_PGR1_PLR1_TT4_4_USFGRO1_VW4_WG8_16_1_WGM1 + SolutionIndex: 1182 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -192624,21 +193690,21 @@ SubGroupA: 8 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 4] + ThreadTile: [4, 2] ThreadTile0: 4 - ThreadTile1: 4 + ThreadTile1: 2 ThreadTileA: 4 - ThreadTileB: 4 + ThreadTileB: 2 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 2 WorkGroup: [8, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 1 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192657,17 +193723,17 @@ DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + ExpandPointerSwap: false + FractionalLoad: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192680,22 +193746,18 @@ KernelLanguage: Assembly LSCA: 16 LSCB: 16 - LSPA: 8 - LSPB: 8 - LVCA: 16 - LVCB: 16 - LVPA: 8 - LVPB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 3168 - LdsNumElementsAlignedA: 576 - LdsNumElementsAlignedB: 576 + LdsNumElements: 3072 LdsOffsetA: 0 - LdsOffsetA_Blk: 2048 - LdsOffsetB: 576 - LdsOffsetB_Blk: 2624 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192707,10 +193769,10 @@ LoopDoWhile: false LoopTail: true LoopUnroll: 16 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192719,15 +193781,13 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 4 - NumLoadsB: 4 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 128 + NumLoadsPerpendicularB: 2 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -192741,7 +193801,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192793,31 +193853,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1177 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x16_SE_EPS1_FL0_GRVW1_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM8 + SolutionIndex: 1183 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x64x16_SE_EPS0_FL1_GRVW2_GSU1_PGR0_PLR1_TT8_4_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 8 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 1 WorkGroupMappingType: B _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -192829,24 +193891,24 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: false + ExpandPointerSwap: true FractionalLoad: 0 - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 + GlobalReadVectorWidth: 2 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false @@ -192857,20 +193919,24 @@ InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 + LSCA: 16 + LSCB: 16 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 2176 + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 LdsOffsetA: 0 - LdsOffsetB: 1088 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true @@ -192881,11 +193947,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -192894,15 +193960,15 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 8 - NumLoadsB: 8 + NumElementsPerThread: 64 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -192916,7 +193982,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false + PrefetchGlobalRead: true PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -192968,31 +194034,31 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1178 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS0_FL0_GRVW1_LPA2_LPB2_PGR0_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1184 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT128x128x16_SE_EPS1_FL0_GRVW2_GSU1_PGR1_PLR1_TT8_8_VW2_WG16_16_1_WGM1 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 + SubGroup0: 16 SubGroup1: 16 - SubGroupA: 8 + SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 2] - ThreadTile0: 4 - ThreadTile1: 2 - ThreadTileA: 4 - ThreadTileB: 2 + ThreadTile: [8, 8] + ThreadTile0: 8 + ThreadTile1: 8 + ThreadTileA: 8 + ThreadTileB: 8 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true VectorWidth: 2 - WorkGroup: [8, 16, 1] + WorkGroup: [16, 16, 1] WorkGroupMapping: 1 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -193004,7 +194070,7 @@ BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -193012,9 +194078,9 @@ DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true - FractionalLoad: 0 + FractionalLoad: 1 GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + GlobalLoadVectorWidthB: 2 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true @@ -193025,45 +194091,45 @@ GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 2 + GlobalWriteVectorWidth: 4 GuaranteeNoPartialA: true GuaranteeNoPartialB: true ISA: [9, 0, 6] InnerUnroll: 1 InterleaveAlpha: 0 KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 16 - LSPB: 16 - LVCA: 8 + LSCA: 16 + LSCB: 16 + LSPA: 64 + LSPB: 32 + LVCA: 4 LVCB: 8 - LVPA: 4 - LVPB: 4 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: false - LdsNumElements: 6272 - LdsNumElementsAlignedA: 1088 - LdsNumElementsAlignedB: 1088 + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 512 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1088 - LdsOffsetB_Blk: 5184 - LdsPadA: 2 - LdsPadB: 2 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 LocalDotLayout: 1 LocalRead2A: true LocalRead2B: true - LocalSplitU: 1 + LocalSplitU: 2 LocalWrite2A: true LocalWrite2B: true LocalWriteUseSgprA: false LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 32 + LoopUnroll: 8 + MacroTile0: 64 MacroTile1: 32 - MacroTileA: 32 + MacroTileA: 64 MacroTileB: 32 MacroTileShapeMax: 64 MacroTileShapeMin: 1 @@ -193074,14 +194140,12 @@ NonTemporalB: 0 NonTemporalC: 0 NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 4 - NumLoadsA: 2 - NumLoadsB: 2 + NumGlobalWriteVectorsPerThread: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 128 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 PackFreeDims: 1 @@ -193147,31 +194211,33 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 1179 - SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x32_SE_EPS1_FL0_GRVW4_LPA2_LPB2_PGR1_PLR1_TT4_2_USFGRO1_VW2_WG8_16_1_WGM1 + SolutionIndex: 1185 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT64x32x16_SE_EPS1_FL1_GRVW4_GSU1_PGR1_PLR1_TT4_4_VW4_WG16_8_2_WGM8 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 + SubGroup0: 16 + SubGroup1: 8 + SubGroupA: 16 + SubGroupB: 8 SuppressNoLoadLoop: false - ThreadTile: [4, 2] + ThreadTile: [4, 4] ThreadTile0: 4 - ThreadTile1: 2 + ThreadTile1: 4 ThreadTileA: 4 - ThreadTileB: 2 + ThreadTileB: 4 UnrollMemFence: false UseSgprForGRO: 1 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 2 - WorkGroup: [8, 16, 1] - WorkGroupMapping: 1 + VectorWidth: 4 + WorkGroup: [16, 8, 2] + WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 1 + _staggerStrideShift: 2 + fractionalPerpOverhangA: 0 + fractionalPerpOverhangB: 0 - [2, 3, 0, 1] - - - [1024, 128, 1, 128] - [12, 896.219] @@ -199317,6 +200383,8 @@ - [556, 7802.45] - - [256, 224, 9, 9792] - [571, 7100.97] + - - [128, 128, 11, 3264] + - [577, 4828.06] - - [256, 256, 9, 4896] - [569, 6163.1] - - [320, 256, 9, 4896] @@ -199337,6 +200405,8 @@ - [565, 4940.68] - - [128, 128, 9, 9792] - [587, 4094.51] + - - [128, 128, 11, 6528] + - [587, 4780.97] - - [192, 192, 11, 6528] - [552, 6918.07] - - [160, 160, 9, 4896] @@ -199347,6 +200417,8 @@ - [558, 7526.25] - - [224, 192, 11, 6528] - [582, 7333.58] + - - [192, 192, 9, 19584] + - [583, 5859.95] - - [256, 224, 11, 13056] - [556, 6512.15] - - [224, 192, 11, 13056] @@ -199367,6 +200439,8 @@ - [580, 1382.91] - - [224, 192, 11, 3264] - [583, 7336.37] + - - [128, 128, 9, 19584] + - [550, 3631.15] - - [224, 224, 11, 6528] - [572, 5718.39] - - [160, 160, 11, 13056] @@ -199387,6 +200461,8 @@ - [551, 7770.2] - - [320, 256, 11, 13056] - [561, 8806.16] + - - [64, 64, 9, 345728] + - [589, 1386.57] - - [128, 128, 9, 4896] - [587, 4041.34] - - [256, 256, 9, 9792] @@ -199395,6 +200471,8 @@ - [564, 6936.98] - - [320, 256, 11, 3264] - [560, 8630.45] + - - [256, 256, 11, 6528] + - [552, 7354.98] - - [224, 192, 9, 4896] - [584, 6747.03] - - [256, 224, 9, 19584] @@ -199409,6 +200487,8 @@ - [573, 5133.73] - - [256, 224, 11, 6528] - [566, 6509.68] + - - [128, 128, 11, 13056] + - [557, 4411.67] - - [192, 160, 9, 4896] - [586, 5118.14] - - [256, 224, 11, 3264] @@ -199419,6 +200499,8 @@ - [567, 6185.35] - - [256, 256, 9, 19584] - [556, 6147.61] + - - [192, 128, 11, 13056] + - [567, 5112.27] - - [224, 192, 9, 9792] - [554, 6657.91] - - [160, 160, 11, 6528] @@ -199427,6258 +200509,6286 @@ - [566, 7023.59] - - [192, 128, 9, 9792] - [557, 5400.54] + - - [1024, 6400, 1, 65] + - [590, 5298.31] + - - [4096, 6400, 1, 256] + - [591, 9150.88] + - - [4096, 64, 1, 1024] + - [592, 5482.75] - - [1024, 128, 1, 128] - - [602, 896.219] + - [605, 896.219] - - [4, 704, 1, 1280] - - [639, 328.876] + - [642, 328.876] - - [4, 1856, 1, 3328] - - [649, 501.361] + - [652, 501.361] - - [1856, 448, 1, 3328] - - [694, 5677.91] + - [697, 5677.91] - - [2944, 4288, 1, 1280] - - [680, 8412.39] + - [683, 8412.39] - - [2368, 64, 1, 3328] - - [630, 4913.92] + - [633, 4913.92] - - [1760, 32, 1, 1760] - - [657, 3312.94] + - [660, 3312.94] - - [2368, 5888, 1, 256] - - [680, 6489.72] + - [683, 6489.72] - - [5888, 1856, 1, 256] - - [692, 7791.88] + - [695, 7791.88] - - [128, 64, 1, 256] - - [664, 369.217] + - [667, 369.217] - - [512, 24000, 1, 1536] - - [686, 8827.37] + - [689, 8827.37] - - [128, 6784, 1, 3328] - - [686, 6536.99] + - [689, 6536.99] - - [5888, 1408, 1, 256] - - [700, 6129.61] + - [703, 6129.61] - - [5888, 1856, 1, 3328] - - [686, 7969.17] + - [689, 7969.17] - - [5056, 704, 1, 256] - - [686, 6723.82] + - [689, 6723.82] - - [2048, 400, 1, 512] - - [692, 4531.44] + - [695, 4531.44] - - [5888, 2944, 1, 3328] - - [692, 8608.04] + - [695, 8608.04] - - [1856, 4288, 1, 256] - - [692, 6297.54] + - [695, 6297.54] - - [1024, 5056, 1, 128] - - [670, 3595.37] + - [673, 3595.37] - - [5056, 5056, 1, 3328] - - [686, 8559.16] + - [689, 8559.16] - - [1408, 5888, 1, 1280] - - [681, 6797.06] + - [684, 6797.06] - - [2368, 448, 1, 128] - - [670, 2814.9] + - [673, 2814.9] - - [2368, 6784, 1, 128] - - [674, 4781.98] + - [677, 4781.98] - - [1024, 3584, 1, 3328] - - [682, 8402.44] + - [685, 8402.44] - - [512, 48000, 1, 2048] - - [686, 8162.23] + - [689, 8162.23] - - [128, 448, 1, 1280] - - [657, 2903.49] + - [660, 2903.49] - - [256, 4288, 1, 3328] - - [687, 6345.94] + - [690, 6345.94] - - [5888, 1408, 1, 1280] - - [686, 8959.45] + - [689, 8959.45] - - [704, 1856, 1, 3328] - - [681, 6955.27] + - [684, 6955.27] - - [4, 1408, 1, 128] - - [701, 60.0747] + - [704, 60.0747] - - [1024, 2368, 1, 256] - - [688, 5927.78] + - [691, 5927.78] - - [64, 4, 1, 256] - - [706, 13.2129] + - [709, 13.2129] - - [1408, 1856, 1, 1280] - - [684, 8051.58] + - [687, 8051.58] - - [1408, 64, 1, 1280] - - [660, 3400.45] + - [663, 3400.45] - - [448, 1024, 1, 1280] - - [688, 5729.92] + - [691, 5729.92] - - [6144, 24000, 1, 2048] - - [692, 7738.3] + - [695, 7738.3] - - [4096, 32, 1, 4096] - - [630, 2381.43] + - [633, 2381.43] - - [256, 1408, 1, 3328] - - [688, 4844.78] + - [691, 4844.78] - - [5056, 5056, 1, 1280] - - [692, 9090.1] + - [695, 9090.1] - - [448, 5056, 1, 256] - - [698, 4961.18] + - [701, 4961.18] - - [704, 1856, 1, 1280] - - [684, 6456.44] + - [687, 6456.44] - - [128, 5056, 1, 128] - - [613, 2251.02] + - [616, 2251.02] - - [2368, 128, 1, 256] - - [681, 3403.27] + - [684, 3403.27] - - [1760, 6400, 1, 1760] - - [680, 8959.7] + - [683, 8959.7] - - [1856, 1408, 1, 128] - - [673, 3493.06] + - [676, 3493.06] - - [64, 5056, 1, 256] - - [682, 2582.22] + - [685, 2582.22] - - [6784, 256, 1, 3328] - - [680, 7323.54] + - [683, 7323.54] - - [6784, 4288, 1, 3328] - - [682, 8542.09] + - [685, 8542.09] - - [4288, 448, 1, 256] - - [698, 5030.5] + - [701, 5030.5] - - [64, 704, 1, 128] - - [615, 375.467] + - [618, 375.467] - - [1856, 2368, 1, 3328] - - [691, 6742.34] + - [694, 6742.34] - - [4288, 2944, 1, 1280] - - [692, 8578.17] + - [695, 8578.17] - - [704, 5056, 1, 1280] - - [688, 8014.45] + - [691, 8014.45] - - [2368, 704, 1, 3328] - - [687, 6544.31] + - [690, 6544.31] - - [256, 5888, 1, 256] - - [685, 5932.9] + - [688, 5932.9] - - [1856, 4288, 1, 3328] - - [691, 7410.72] + - [694, 7410.72] - - [256, 2944, 1, 256] - - [687, 5013.98] + - [690, 5013.98] - - [5888, 1024, 1, 256] - - [692, 8069.34] + - [695, 8069.34] - - [448, 64, 1, 1280] - - [667, 2057.18] + - [670, 2057.18] - - [3072, 64, 1, 1024] - - [647, 2145.42] + - [650, 2145.42] - - [3584, 4, 1, 1280] - - [639, 498.643] + - [642, 498.643] - - [16384, 3200, 1, 4096] - - [679, 6621.43] + - [682, 6621.43] - - [2944, 64, 1, 256] - - [687, 2554.79] + - [690, 2554.79] - - [128, 4, 1, 1280] - - [649, 87.1489] + - [652, 87.1489] - - [1408, 2944, 1, 256] - - [686, 8029.35] + - [689, 8029.35] - - [256, 1856, 1, 1280] - - [681, 6170.6] + - [684, 6170.6] - - [6784, 5056, 1, 3328] - - [690, 7134.19] + - [693, 7134.19] - - [5056, 5056, 1, 256] - - [698, 6246.8] + - [701, 6246.8] - - [1408, 6784, 1, 128] - - [675, 4329.45] + - [678, 4329.45] - - [64, 1024, 1, 1280] - - [657, 3206.65] + - [660, 3206.65] - - [2944, 4, 1, 256] - - [706, 333.48] + - [709, 333.48] - - [704, 5056, 1, 128] - - [670, 4085.42] + - [673, 4085.42] - - [4, 2368, 1, 1280] - - [707, 394.667] + - [710, 394.667] - - [2368, 2944, 1, 1280] - - [686, 8633.95] + - [689, 8633.95] - - [128, 3584, 1, 1280] - - [687, 6046.15] + - [690, 6046.15] - - [6784, 6784, 1, 1280] - - [692, 8847.41] + - [695, 8847.41] - - [1408, 4288, 1, 1280] - - [692, 8236.69] + - [695, 8236.69] - - [3584, 4288, 1, 1280] - - [687, 7399.88] + - [690, 7399.88] - - [2368, 704, 1, 1280] - - [680, 6754.4] + - [683, 6754.4] - - [5056, 4288, 1, 3328] - - [686, 8569.53] + - [689, 8569.53] - - [3584, 2368, 1, 3328] - - [691, 7942.38] + - [694, 7942.38] - - [64, 704, 1, 1280] - - [660, 2363.59] + - [663, 2363.59] - - [4288, 256, 1, 256] - - [688, 4591.8] + - [691, 4591.8] - - [2944, 128, 1, 128] - - [613, 1878.29] + - [616, 1878.29] - - [6144, 32, 1, 2560] - - [658, 3334.1] + - [661, 3334.1] - - [6784, 448, 1, 1280] - - [690, 7939.2] + - [693, 7939.2] - - [1408, 2944, 1, 128] - - [674, 4096.51] + - [677, 4096.51] - - [4288, 2944, 1, 256] - - [680, 8141.13] + - [683, 8141.13] - - [5888, 704, 1, 1280] - - [681, 7516.13] + - [684, 7516.13] - - [5056, 4, 1, 3328] - - [624, 552.409] + - [627, 552.409] - - [1856, 64, 1, 1280] - - [630, 3870.76] + - [633, 3870.76] - - [1760, 16, 1, 1760] - - [642, 2181.41] + - [645, 2181.41] - - [448, 5888, 1, 128] - - [675, 3371.0] + - [678, 3371.0] - - [5888, 64, 1, 3328] - - [655, 5319.38] + - [658, 5319.38] - - [2944, 256, 1, 3328] - - [687, 7122.3] + - [690, 7122.3] - - [1024, 64, 1, 128] - - [602, 595.782] + - [605, 595.782] - - [5056, 2368, 1, 1280] - - [681, 7778.19] + - [684, 7778.19] - - [448, 3584, 1, 1280] - - [686, 6500.52] + - [689, 6500.52] - - [6784, 5888, 1, 256] - - [686, 8918.58] + - [689, 8918.58] - - [704, 1024, 1, 128] - - [670, 2627.41] + - [673, 2627.41] - - [704, 128, 1, 1280] - - [657, 3408.49] + - [660, 3408.49] - - [4, 3584, 1, 128] - - [701, 140.721] + - [704, 140.721] - - [1408, 448, 1, 1280] - - [681, 5881.44] + - [684, 5881.44] - - [1024, 1408, 1, 256] - - [685, 5647.17] + - [688, 5647.17] - - [2368, 2368, 1, 3328] - - [679, 7688.73] + - [682, 7688.73] - - [1856, 6784, 1, 128] - - [670, 4705.85] + - [673, 4705.85] - - [5056, 704, 1, 3328] - - [690, 8198.88] + - [693, 8198.88] - - [1408, 1856, 1, 256] - - [692, 6339.95] + - [695, 6339.95] - - [1408, 704, 1, 3328] - - [684, 7599.55] + - [687, 7599.55] - - [2368, 5056, 1, 256] - - [692, 8242.75] + - [695, 8242.75] - - [1408, 256, 1, 1280] - - [687, 4879.16] + - [690, 4879.16] - - [3072, 128, 1, 1024] - - [656, 2525.42] + - [659, 2525.42] - - [3584, 2368, 1, 1280] - - [688, 8132.62] + - [691, 8132.62] - - [4288, 64, 1, 3328] - - [643, 5156.43] + - [646, 5156.43] - - [2368, 4, 1, 1280] - - [705, 482.65] + - [708, 482.65] - - [704, 5888, 1, 256] - - [695, 5398.65] + - [698, 5398.65] - - [6784, 2944, 1, 128] - - [671, 4748.89] + - [674, 4748.89] - - [2560, 1600, 1, 2560] - - [682, 7354.9] + - [685, 7354.9] - - [4288, 6784, 1, 3328] - - [679, 7409.31] + - [682, 7409.31] - - [2944, 256, 1, 256] - - [687, 5077.32] + - [690, 5077.32] - - [2944, 6784, 1, 3328] - - [692, 8067.95] + - [695, 8067.95] - - [704, 1408, 1, 3328] - - [687, 7239.33] + - [690, 7239.33] - - [6144, 5984, 1, 2048] - - [686, 7175.97] + - [689, 7175.97] - - [3584, 704, 1, 3328] - - [692, 6642.76] + - [695, 6642.76] - - [2944, 256, 1, 128] - - [671, 2644.44] + - [674, 2644.44] - - [6784, 4, 1, 1280] - - [703, 402.387] + - [706, 402.387] - - [1024, 64, 1, 1280] - - [657, 2601.93] + - [660, 2601.93] - - [2048, 1600, 1, 512] - - [684, 5592.4] + - [687, 5592.4] - - [448, 4288, 1, 256] - - [682, 6128.89] + - [685, 6128.89] - - [64, 3584, 1, 3328] - - [623, 5534.83] + - [626, 5534.83] - - [1856, 4288, 1, 128] - - [673, 4400.01] + - [676, 4400.01] - - [704, 2368, 1, 1280] - - [698, 5734.92] + - [701, 5734.92] - - [1856, 2368, 1, 1280] - - [695, 6482.3] + - [698, 6482.3] - - [2368, 128, 1, 3328] - - [668, 4717.22] + - [671, 4717.22] - - [2944, 128, 1, 256] - - [695, 3276.8] + - [698, 3276.8] - - [448, 1408, 1, 256] - - [687, 4852.18] + - [690, 4852.18] - - [1856, 4288, 1, 1280] - - [682, 8132.86] + - [685, 8132.86] - - [64, 5056, 1, 3328] - - [658, 5096.96] + - [661, 5096.96] - - [4, 704, 1, 256] - - [705, 128.731] + - [708, 128.731] - - [1024, 448, 1, 128] - - [670, 1816.84] + - [673, 1816.84] - - [704, 4, 1, 1280] - - [706, 328.876] + - [709, 328.876] - - [704, 256, 1, 128] - - [674, 876.469] + - [677, 876.469] - - [704, 2944, 1, 128] - - [674, 3734.37] + - [677, 3734.37] - - [1408, 1024, 1, 1280] - - [682, 7224.75] + - [685, 7224.75] - - [704, 6784, 1, 256] - - [686, 7354.67] + - [689, 7354.67] - - [6784, 704, 1, 256] - - [682, 6012.18] + - [685, 6012.18] - - [5056, 1408, 1, 128] - - [675, 4311.18] + - [678, 4311.18] - - [2048, 7000, 1, 2048] - - [686, 7231.97] + - [689, 7231.97] - - [256, 3584, 1, 3328] - - [690, 7005.9] + - [693, 7005.9] - - [4, 5888, 1, 3328] - - [708, 534.512] + - [711, 534.512] - - [128, 1408, 1, 128] - - [600, 1176.97] + - [603, 1176.97] - - [3584, 4288, 1, 3328] - - [692, 7134.9] + - [695, 7134.9] - - [5888, 1856, 1, 1280] - - [680, 8394.93] + - [683, 8394.93] - - [256, 1408, 1, 256] - - [681, 3977.36] + - [684, 3977.36] - - [5056, 64, 1, 1280] - - [681, 4257.68] + - [684, 4257.68] - - [1024, 704, 1, 256] - - [681, 5036.83] + - [684, 5036.83] - - [448, 128, 1, 128] - - [602, 533.433] + - [605, 533.433] - - [2368, 3584, 1, 1280] - - [686, 8272.33] + - [689, 8272.33] - - [2368, 6784, 1, 1280] - - [679, 8288.14] + - [682, 8288.14] - - [1856, 4, 1, 1280] - - [619, 464.0] + - [622, 464.0] - - [448, 448, 1, 256] - - [681, 3058.35] + - [684, 3058.35] - - [2944, 3584, 1, 3328] - - [686, 8557.53] + - [689, 8557.53] - - [7680, 32, 1, 2560] - - [658, 3728.93] + - [661, 3728.93] - - [128, 4288, 1, 128] - - [601, 2116.1] + - [604, 2116.1] - - [256, 256, 1, 3328] - - [657, 4050.96] + - [660, 4050.96] - - [128, 1024, 1, 3328] - - [630, 5139.11] + - [633, 5139.11] - - [4, 1408, 1, 3328] - - [649, 502.771] + - [652, 502.771] - - [6784, 2944, 1, 256] - - [680, 8445.96] + - [683, 8445.96] - - [64, 1856, 1, 1280] - - [622, 3870.76] + - [625, 3870.76] - - [6784, 64, 1, 128] - - [670, 1877.52] + - [673, 1877.52] - - [4288, 2368, 1, 3328] - - [690, 8419.3] + - [693, 8419.3] - - [1856, 2368, 1, 256] - - [684, 6887.38] + - [687, 6887.38] - - [3584, 256, 1, 128] - - [674, 2496.61] + - [677, 2496.61] - - [3584, 6784, 1, 3328] - - [686, 7626.08] + - [689, 7626.08] - - [256, 1024, 1, 256] - - [687, 3095.43] + - [690, 3095.43] - - [4, 6784, 1, 3328] - - [649, 589.174] + - [652, 589.174] - - [1024, 5888, 1, 3328] - - [686, 7794.25] + - [689, 7794.25] - - [1024, 128, 1, 1280] - - [659, 3130.08] + - [662, 3130.08] - - [3072, 32, 1, 1024] - - [646, 1675.49] + - [649, 1675.49] - - [6144, 24000, 1, 2560] - - [686, 7256.04] + - [689, 7256.04] - - [5056, 4288, 1, 1280] - - [684, 8348.93] + - [687, 8348.93] - - [5888, 64, 1, 256] - - [633, 2593.25] + - [636, 2593.25] - - [6784, 1856, 1, 3328] - - [680, 8087.28] + - [683, 8087.28] - - [1408, 5056, 1, 1280] - - [682, 7802.53] + - [685, 7802.53] - - [1856, 256, 1, 1280] - - [687, 6150.63] + - [690, 6150.63] - - [64, 5888, 1, 3328] - - [654, 5301.39] + - [657, 5301.39] - - [2368, 2368, 1, 1280] - - [684, 8233.33] + - [687, 8233.33] - - [2944, 5888, 1, 128] - - [677, 3745.41] + - [680, 3745.41] - - [704, 5888, 1, 1280] - - [682, 8244.94] + - [685, 8244.94] - - [2368, 3584, 1, 128] - - [674, 4523.33] + - [677, 4523.33] - - [1856, 5056, 1, 128] - - [671, 4497.98] + - [674, 4497.98] - - [704, 1024, 1, 1280] - - [695, 5479.49] + - [698, 5479.49] - - [448, 256, 1, 3328] - - [638, 5048.7] + - [641, 5048.7] - - [448, 1856, 1, 128] - - [671, 2936.82] + - [674, 2936.82] - - [8192, 3200, 1, 2048] - - [680, 6713.02] + - [683, 6713.02] - - [128, 1024, 1, 128] - - [616, 998.644] + - [619, 998.644] - - [2944, 4, 1, 128] - - [701, 98.6471] + - [704, 98.6471] - - [1024, 704, 1, 1280] - - [687, 5896.9] + - [690, 5896.9] - - [128, 5888, 1, 256] - - [687, 5013.98] + - [690, 5013.98] - - [1024, 5056, 1, 1280] - - [686, 8857.71] + - [689, 8857.71] - - [4288, 1024, 1, 256] - - [692, 6195.29] + - [695, 6195.29] - - [2944, 2368, 1, 128] - - [670, 4442.13] + - [673, 4442.13] - - [704, 704, 1, 3328] - - [687, 6764.3] + - [690, 6764.3] - - [704, 1408, 1, 1280] - - [688, 7383.48] + - [691, 7383.48] - - [5888, 448, 1, 1280] - - [686, 7299.39] + - [689, 7299.39] - - [3584, 256, 1, 3328] - - [684, 7061.62] + - [687, 7061.62] - - [704, 5888, 1, 3328] - - [688, 8142.32] + - [691, 8142.32] - - [704, 1856, 1, 128] - - [674, 3139.04] + - [677, 3139.04] - - [448, 448, 1, 3328] - - [652, 5063.24] + - [655, 5063.24] - - [4, 4288, 1, 128] - - [702, 64.8775] + - [705, 64.8775] - - [128, 704, 1, 1280] - - [622, 3400.45] + - [625, 3400.45] - - [3584, 2944, 1, 256] - - [692, 7982.04] + - [695, 7982.04] - - [3584, 4, 1, 128] - - [701, 105.218] + - [704, 105.218] - - [1856, 128, 1, 3328] - - [653, 5442.09] + - [656, 5442.09] - - [4, 64, 1, 1280] - - [707, 42.2268] + - [710, 42.2268] - - [2944, 448, 1, 128] - - [670, 2926.85] + - [673, 2926.85] - - [128, 2944, 1, 1280] - - [681, 5109.59] + - [684, 5109.59] - - [64, 64, 1, 3328] - - [649, 1252.89] + - [652, 1252.89] - - [448, 2944, 1, 1280] - - [690, 6684.37] + - [693, 6684.37] - - [512, 24000, 1, 2048] - - [686, 7938.93] + - [689, 7938.93] - - [128, 256, 1, 3328] - - [667, 3276.8] + - [670, 3276.8] - - [1408, 5056, 1, 3328] - - [692, 8959.11] + - [695, 8959.11] - - [1856, 1856, 1, 3328] - - [682, 8006.07] + - [685, 8006.07] - - [3584, 128, 1, 256] - - [687, 4292.42] + - [690, 4292.42] - - [2560, 800, 1, 2560] - - [682, 6262.38] + - [685, 6262.38] - - [448, 1408, 1, 3328] - - [698, 4997.25] + - [701, 4997.25] - - [2368, 2368, 1, 256] - - [700, 4978.84] + - [703, 4978.84] - - [4288, 4288, 1, 1280] - - [679, 8617.68] + - [682, 8617.68] - - [64, 448, 1, 1280] - - [625, 2057.18] + - [628, 2057.18] - - [5888, 1024, 1, 1280] - - [697, 6848.07] + - [700, 6848.07] - - [1408, 4288, 1, 256] - - [680, 7076.91] + - [683, 7076.91] - - [448, 4, 1, 256] - - [705, 84.3294] + - [708, 84.3294] - - [5888, 448, 1, 128] - - [674, 3493.81] + - [677, 3493.81] - - [512, 48000, 1, 2560] - - [692, 8960.03] + - [695, 8960.03] - - [35, 8457, 1, 1760] - - [594, 3934.68] + - [597, 3934.68] - - [704, 6784, 1, 3328] - - [679, 8180.78] + - [682, 8180.78] - - [2560, 6400, 1, 2560] - - [680, 7822.14] + - [683, 7822.14] - - [5056, 1024, 1, 1280] - - [682, 8357.28] + - [685, 8357.28] - - [448, 5888, 1, 3328] - - [686, 7505.18] + - [689, 7505.18] - - [128, 4, 1, 128] - - [701, 0.562251] + - [704, 0.562251] - - [1024, 2944, 1, 1280] - - [686, 8406.14] + - [689, 8406.14] - - [5056, 5888, 1, 1280] - - [686, 8819.66] + - [689, 8819.66] - - [4288, 5888, 1, 128] - - [671, 3522.22] + - [674, 3522.22] - - [256, 3584, 1, 256] - - [682, 5883.79] + - [685, 5883.79] - - [1408, 3584, 1, 128] - - [670, 4283.31] + - [673, 4283.31] - - [256, 2944, 1, 3328] - - [690, 5670.53] + - [693, 5670.53] - - [448, 3584, 1, 128] - - [674, 3171.62] + - [677, 3171.62] - - [5888, 2944, 1, 1280] - - [692, 8198.76] + - [695, 8198.76] - - [4, 6784, 1, 1280] - - [639, 553.796] + - [642, 553.796] - - [2368, 5888, 1, 128] - - [670, 4787.22] + - [673, 4787.22] - - [8448, 16, 1, 2816] - - [629, 2452.53] + - [632, 2452.53] - - [64, 2944, 1, 128] - - [602, 1376.56] + - [605, 1376.56] - - [2368, 4, 1, 256] - - [624, 278.077] + - [627, 278.077] - - [3584, 5888, 1, 256] - - [700, 6233.56] + - [703, 6233.56] - - [2368, 1024, 1, 128] - - [671, 3781.41] + - [674, 3781.41] - - [2368, 704, 1, 128] - - [671, 3198.22] + - [674, 3198.22] - - [3584, 2944, 1, 1280] - - [682, 8045.58] + - [685, 8045.58] - - [3584, 2368, 1, 128] - - [671, 4188.47] + - [674, 4188.47] - - [5056, 704, 1, 128] - - [674, 4019.11] + - [677, 4019.11] - - [448, 2368, 1, 128] - - [676, 2522.11] + - [679, 2522.11] - - [5056, 1408, 1, 3328] - - [684, 8349.83] + - [687, 8349.83] - - [1408, 704, 1, 256] - - [690, 4741.32] + - [693, 4741.32] - - [6784, 1024, 1, 3328] - - [692, 8769.4] + - [695, 8769.4] - - [6784, 2944, 1, 3328] - - [689, 7319.64] + - [692, 7319.64] - - [2944, 5056, 1, 3328] - - [679, 8889.66] + - [682, 8889.66] - - [1856, 1856, 1, 256] - - [682, 6309.74] + - [685, 6309.74] - - [1024, 5888, 1, 128] - - [673, 3759.5] + - [676, 3759.5] - - [6784, 2368, 1, 1280] - - [682, 8298.3] + - [685, 8298.3] - - [256, 4, 1, 128] - - [701, 7.00171] + - [704, 7.00171] - - [4288, 5888, 1, 1280] - - [686, 8365.18] + - [689, 8365.18] - - [4288, 4288, 1, 256] - - [686, 6513.68] + - [689, 6513.68] - - [8448, 32, 1, 2816] - - [657, 4257.64] + - [660, 4257.64] - - [448, 2944, 1, 3328] - - [690, 6875.52] + - [693, 6875.52] - - [5888, 4, 1, 128] - - [701, 163.84] + - [704, 163.84] - - [4288, 1856, 1, 1280] - - [686, 8402.81] + - [689, 8402.81] - - [1856, 2944, 1, 3328] - - [686, 6612.11] + - [689, 6612.11] - - [256, 6784, 1, 3328] - - [687, 7358.6] + - [690, 7358.6] - - [64, 5888, 1, 256] - - [681, 3358.95] + - [684, 3358.95] - - [256, 5056, 1, 128] - - [674, 2489.11] + - [677, 2489.11] - - [5056, 1024, 1, 256] - - [692, 8077.77] + - [695, 8077.77] - - [704, 64, 1, 3328] - - [636, 3288.3] + - [639, 3288.3] - - [5056, 1856, 1, 3328] - - [690, 8171.03] + - [693, 8171.03] - - [4, 2944, 1, 3328] - - [649, 546.743] + - [652, 546.743] - - [4, 5056, 1, 256] - - [624, 378.461] + - [627, 378.461] - - [1856, 1408, 1, 256] - - [692, 6320.78] + - [695, 6320.78] - - [8448, 12000, 1, 2816] - - [690, 7365.77] + - [693, 7365.77] - - [6784, 128, 1, 3328] - - [687, 6366.47] + - [690, 6366.47] - - [4288, 1408, 1, 128] - - [670, 4451.6] + - [673, 4451.6] - - [1856, 5888, 1, 3328] - - [688, 8619.66] + - [691, 8619.66] - - [4288, 5056, 1, 256] - - [692, 7288.95] + - [695, 7288.95] - - [1408, 128, 1, 1280] - - [630, 4291.05] + - [633, 4291.05] - - [4096, 800, 1, 1024] - - [681, 5867.79] + - [684, 5867.79] - - [5056, 256, 1, 3328] - - [687, 7527.51] + - [690, 7527.51] - - [704, 704, 1, 256] - - [687, 4417.75] + - [690, 4417.75] - - [1024, 5888, 1, 1280] - - [692, 8674.47] + - [695, 8674.47] - - [6784, 2368, 1, 128] - - [670, 4723.98] + - [673, 4723.98] - - [4, 5056, 1, 1280] - - [639, 540.207] + - [642, 540.207] - - [256, 64, 1, 1280] - - [641, 1515.28] + - [644, 1515.28] - - [128, 1856, 1, 1280] - - [681, 4574.11] + - [684, 4574.11] - - [1856, 1024, 1, 1280] - - [686, 7741.51] + - [689, 7741.51] - - [6784, 4288, 1, 1280] - - [692, 8521.19] + - [695, 8521.19] - - [2560, 64, 1, 2560] - - [623, 3504.6] + - [626, 3504.6] - - [1856, 1856, 1, 1280] - - [682, 7779.21] + - [685, 7779.21] - - [4096, 400, 1, 1024] - - [692, 4157.71] + - [695, 4157.71] - - [3072, 24000, 1, 1024] - - [692, 8663.35] + - [695, 8663.35] - - [128, 4288, 1, 3328] - - [638, 5674.13] + - [641, 5674.13] - - [4, 2368, 1, 3328] - - [649, 525.38] + - [652, 525.38] - - [5888, 1856, 1, 128] - - [674, 4099.64] + - [677, 4099.64] - - [448, 704, 1, 1280] - - [687, 4309.37] + - [690, 4309.37] - - [128, 5056, 1, 1280] - - [630, 5068.36] + - [633, 5068.36] - - [1024, 448, 1, 3328] - - [690, 6077.72] + - [693, 6077.72] - - [1856, 704, 1, 1280] - - [698, 6257.39] + - [701, 6257.39] - - [5056, 3584, 1, 128] - - [671, 4598.42] + - [674, 4598.42] - - [5888, 5888, 1, 3328] - - [692, 8058.15] + - [695, 8058.15] - - [6784, 1024, 1, 256] - - [692, 5120.89] + - [695, 5120.89] - - [2944, 2368, 1, 256] - - [683, 6522.93] + - [686, 6522.93] - - [256, 448, 1, 256] - - [633, 1816.84] + - [636, 1816.84] - - [5056, 5888, 1, 3328] - - [685, 6722.31] + - [688, 6722.31] - - [1856, 1024, 1, 256] - - [692, 6632.21] + - [695, 6632.21] - - [512, 48000, 1, 1536] - - [686, 8555.91] + - [689, 8555.91] - - [3584, 448, 1, 1280] - - [681, 6566.99] + - [684, 6566.99] - - [8448, 5984, 1, 2816] - - [686, 8990.56] + - [689, 8990.56] - - [448, 5888, 1, 256] - - [686, 6220.37] + - [689, 6220.37] - - [704, 64, 1, 128] - - [599, 450.56] + - [602, 450.56] - - [1408, 6784, 1, 3328] - - [679, 8478.58] + - [682, 8478.58] - - [448, 1024, 1, 128] - - [678, 1844.23] + - [681, 1844.23] - - [4288, 704, 1, 128] - - [674, 3895.16] + - [677, 3895.16] - - [128, 1856, 1, 128] - - [605, 1456.36] + - [608, 1456.36] - - [448, 2368, 1, 3328] - - [684, 5537.94] + - [687, 5537.94] - - [5056, 64, 1, 128] - - [670, 1648.84] + - [673, 1648.84] - - [5056, 2944, 1, 256] - - [686, 8230.77] + - [689, 8230.77] - - [6784, 5888, 1, 128] - - [670, 4873.09] + - [673, 4873.09] - - [1024, 700, 1, 512] - - [684, 4445.27] + - [687, 4445.27] - - [704, 1024, 1, 256] - - [682, 4707.89] + - [685, 4707.89] - - [1024, 4, 1, 256] - - [624, 174.763] + - [627, 174.763] - - [2944, 704, 1, 128] - - [674, 3483.32] + - [677, 3483.32] - - [128, 6784, 1, 1280] - - [682, 6522.83] + - [685, 6522.83] - - [1408, 3584, 1, 3328] - - [686, 8673.49] + - [689, 8673.49] - - [2368, 6784, 1, 256] - - [682, 7941.66] + - [685, 7941.66] - - [5056, 1408, 1, 1280] - - [686, 8800.91] + - [689, 8800.91] - - [256, 256, 1, 128] - - [611, 551.882] + - [614, 551.882] - - [5056, 4288, 1, 128] - - [678, 3793.54] + - [681, 3793.54] - - [1408, 1856, 1, 128] - - [670, 3067.64] + - [673, 3067.64] - - [1408, 5888, 1, 3328] - - [686, 9148.87] + - [689, 9148.87] - - [1856, 256, 1, 256] - - [682, 4319.42] + - [685, 4319.42] - - [6784, 6784, 1, 256] - - [682, 7668.43] + - [685, 7668.43] - - [64, 256, 1, 128] - - [616, 131.072] + - [619, 131.072] - - [4288, 2368, 1, 128] - - [671, 4582.89] + - [674, 4582.89] - - [256, 4288, 1, 1280] - - [681, 6058.51] + - [684, 6058.51] - - [2368, 2944, 1, 256] - - [686, 8015.97] + - [689, 8015.97] - - [4, 1856, 1, 256] - - [703, 252.732] + - [706, 252.732] - - [3584, 1856, 1, 1280] - - [682, 7760.14] + - [685, 7760.14] - - [6784, 6784, 1, 128] - - [671, 4970.04] + - [674, 4970.04] - - [256, 1856, 1, 128] - - [677, 1580.49] + - [680, 1580.49] - - [704, 64, 1, 1280] - - [666, 2556.37] + - [669, 2556.37] - - [5888, 5056, 1, 256] - - [686, 8216.57] + - [689, 8216.57] - - [8448, 48000, 1, 2816] - - [692, 4082.79] + - [695, 4082.79] - - [3584, 448, 1, 256] - - [686, 5518.82] + - [689, 5518.82] - - [448, 4288, 1, 128] - - [674, 3415.15] + - [677, 3415.15] - - [7680, 64, 1, 2560] - - [635, 5162.0] + - [638, 5162.0] - - [256, 6784, 1, 256] - - [686, 6272.52] + - [689, 6272.52] - - [1408, 4288, 1, 128] - - [674, 4343.53] + - [677, 4343.53] - - [2944, 704, 1, 3328] - - [681, 7679.61] + - [684, 7679.61] - - [128, 448, 1, 256] - - [621, 1422.49] + - [624, 1422.49] - - [5056, 256, 1, 1280] - - [688, 5052.29] + - [691, 5052.29] - - [2560, 32, 1, 2560] - - [644, 3105.97] + - [647, 3105.97] - - [3584, 3584, 1, 256] - - [692, 8260.47] + - [695, 8260.47] - - [448, 1408, 1, 128] - - [670, 2397.28] + - [673, 2397.28] - - [128, 256, 1, 1280] - - [625, 2340.57] + - [628, 2340.57] - - [3584, 5056, 1, 256] - - [692, 7347.46] + - [695, 7347.46] - - [6784, 128, 1, 256] - - [682, 5591.0] + - [685, 5591.0] - - [4288, 4, 1, 256] - - [624, 354.106] + - [627, 354.106] - - [704, 448, 1, 256] - - [687, 3492.23] + - [690, 3492.23] - - [2944, 2368, 1, 1280] - - [694, 6661.61] + - [697, 6661.61] - - [448, 64, 1, 3328] - - [666, 3058.35] + - [669, 3058.35] - - [1408, 3584, 1, 256] - - [692, 7966.49] + - [695, 7966.49] - - [3584, 4, 1, 3328] - - [705, 605.459] + - [708, 605.459] - - [6784, 3584, 1, 256] - - [682, 7525.31] + - [685, 7525.31] - - [256, 128, 1, 128] - - [614, 275.941] + - [617, 275.941] - - [704, 1408, 1, 128] - - [671, 3109.75] + - [674, 3109.75] - - [4, 2368, 1, 256] - - [705, 283.275] + - [708, 283.275] - - [4288, 128, 1, 1280] - - [687, 5132.55] + - [690, 5132.55] - - [128, 1408, 1, 256] - - [681, 2733.25] + - [684, 2733.25] - - [4, 2944, 1, 256] - - [703, 314.027] + - [706, 314.027] - - [64, 128, 1, 3328] - - [651, 1514.61] + - [654, 1514.61] - - [5056, 2368, 1, 128] - - [675, 3449.07] + - [678, 3449.07] - - [2944, 2944, 1, 3328] - - [679, 8168.93] + - [682, 8168.93] - - [5056, 6784, 1, 256] - - [699, 5792.67] + - [702, 5792.67] - - [1856, 3584, 1, 128] - - [676, 4213.4] + - [679, 4213.4] - - [128, 2944, 1, 128] - - [600, 1970.36] + - [603, 1970.36] - - [35, 8457, 1, 2560] - - [595, 3525.05] + - [598, 3525.05] - - [1024, 704, 1, 3328] - - [681, 6784.89] + - [684, 6784.89] - - [6784, 448, 1, 256] - - [690, 6544.78] + - [693, 6544.78] - - [3584, 6784, 1, 128] - - [670, 4623.5] + - [673, 4623.5] - - [128, 4288, 1, 256] - - [684, 3606.5] + - [687, 3606.5] - - [704, 448, 1, 3328] - - [681, 4477.91] + - [684, 4477.91] - - [128, 128, 1, 3328] - - [666, 2177.55] + - [669, 2177.55] - - [5056, 1856, 1, 256] - - [700, 5608.62] + - [703, 5608.62] - - [4608, 5984, 1, 1536] - - [689, 7859.75] + - [692, 7859.75] - - [256, 128, 1, 256] - - [625, 998.644] + - [628, 998.644] - - [1760, 3200, 1, 1760] - - [682, 8179.54] + - [685, 8179.54] - - [1024, 1856, 1, 256] - - [692, 6143.17] + - [695, 6143.17] - - [4096, 1600, 1, 1024] - - [700, 5851.42] + - [703, 5851.42] - - [4288, 64, 1, 128] - - [605, 1372.16] + - [608, 1372.16] - - [256, 448, 1, 3328] - - [644, 4795.0] + - [647, 4795.0] - - [1408, 6784, 1, 1280] - - [686, 8426.4] + - [689, 8426.4] - - [3584, 3584, 1, 1280] - - [686, 7556.46] + - [689, 7556.46] - - [7680, 24000, 1, 2560] - - [679, 5019.09] + - [682, 5019.09] - - [64, 2368, 1, 1280] - - [630, 4061.7] + - [633, 4061.7] - - [448, 2368, 1, 1280] - - [681, 5928.67] + - [684, 5928.67] - - [4608, 48000, 1, 1536] - - [686, 6937.3] + - [689, 6937.3] - - [5888, 5888, 1, 128] - - [671, 3743.9] + - [674, 3743.9] - - [64, 6784, 1, 3328] - - [681, 5988.62] + - [684, 5988.62] - - [2944, 256, 1, 1280] - - [687, 6717.87] + - [690, 6717.87] - - [2048, 16, 1, 2048] - - [639, 1210.48] + - [642, 1210.48] - - [256, 2368, 1, 128] - - [674, 1935.97] + - [677, 1935.97] - - [5056, 2368, 1, 3328] - - [692, 8875.53] + - [695, 8875.53] - - [2944, 4288, 1, 256] - - [686, 8063.14] + - [689, 8063.14] - - [1408, 3584, 1, 1280] - - [682, 8196.97] + - [685, 8196.97] - - [2368, 64, 1, 256] - - [681, 2365.69] + - [684, 2365.69] - - [64, 448, 1, 3328] - - [667, 3027.3] + - [670, 3027.3] - - [704, 128, 1, 3328] - - [638, 4452.09] + - [641, 4452.09] - - [8192, 1600, 1, 2048] - - [686, 7229.83] + - [689, 7229.83] - - [1856, 704, 1, 256] - - [688, 5545.35] + - [691, 5545.35] - - [4, 4288, 1, 1280] - - [639, 523.725] + - [642, 523.725] - - [1408, 448, 1, 3328] - - [693, 4789.3] + - [696, 4789.3] - - [1024, 4, 1, 3328] - - [619, 504.123] + - [622, 504.123] - - [512, 24000, 1, 2560] - - [692, 8903.52] + - [695, 8903.52] - - [2368, 6784, 1, 3328] - - [692, 8311.04] + - [695, 8311.04] - - [1856, 1408, 1, 1280] - - [682, 8160.01] + - [685, 8160.01] - - [1856, 448, 1, 1280] - - [684, 6242.97] + - [687, 6242.97] - - [6784, 704, 1, 128] - - [670, 4068.95] + - [673, 4068.95] - - [4, 4, 1, 256] - - [639, 0.742029] + - [642, 0.742029] - - [128, 5888, 1, 128] - - [670, 2327.92] + - [673, 2327.92] - - [1408, 5888, 1, 256] - - [681, 6986.81] + - [684, 6986.81] - - [704, 2944, 1, 1280] - - [682, 7904.93] + - [685, 7904.93] - - [4288, 64, 1, 1280] - - [657, 3828.17] + - [660, 3828.17] - - [256, 64, 1, 256] - - [632, 655.36] + - [635, 655.36] - - [704, 1856, 1, 256] - - [690, 5444.27] + - [693, 5444.27] - - [704, 6784, 1, 128] - - [670, 4319.67] + - [673, 4319.67] - - [3584, 704, 1, 1280] - - [690, 7726.33] + - [693, 7726.33] - - [256, 128, 1, 1280] - - [625, 2184.53] + - [628, 2184.53] - - [5888, 2368, 1, 256] - - [692, 8192.59] + - [695, 8192.59] - - [256, 2368, 1, 1280] - - [687, 5675.44] + - [690, 5675.44] - - [2944, 6784, 1, 128] - - [675, 4248.25] + - [678, 4248.25] - - [3584, 448, 1, 3328] - - [686, 6560.67] + - [689, 6560.67] - - [1408, 4, 1, 256] - - [704, 176.69] + - [707, 176.69] - - [704, 2368, 1, 3328] - - [687, 7085.21] + - [690, 7085.21] - - [2944, 448, 1, 256] - - [683, 3411.9] + - [686, 3411.9] - - [1856, 448, 1, 128] - - [671, 2748.72] + - [674, 2748.72] - - [4288, 4, 1, 3328] - - [639, 553.548] + - [642, 553.548] - - [2368, 128, 1, 1280] - - [660, 4173.55] + - [663, 4173.55] - - [256, 5888, 1, 128] - - [675, 2860.88] + - [678, 2860.88] - - [64, 6784, 1, 256] - - [688, 3637.08] + - [691, 3637.08] - - [64, 5056, 1, 1280] - - [687, 4289.43] + - [690, 4289.43] - - [4, 6784, 1, 128] - - [701, 160.806] + - [704, 160.806] - - [2048, 3200, 1, 512] - - [688, 6926.99] + - [691, 6926.99] - - [2944, 2944, 1, 1280] - - [680, 6267.75] + - [683, 6267.75] - - [5056, 448, 1, 3328] - - [681, 7400.26] + - [684, 7400.26] - - [4, 3584, 1, 1280] - - [639, 499.73] + - [642, 499.73] - - [1408, 128, 1, 128] - - [616, 1037.26] + - [619, 1037.26] - - [6784, 704, 1, 3328] - - [687, 7633.85] + - [690, 7633.85] - - [128, 64, 1, 1280] - - [639, 1170.29] + - [642, 1170.29] - - [2368, 256, 1, 1280] - - [687, 5609.79] + - [690, 5609.79] - - [4, 448, 1, 3328] - - [707, 358.4] + - [710, 358.4] - - [5888, 4288, 1, 128] - - [675, 4521.64] + - [678, 4521.64] - - [4, 5888, 1, 256] - - [639, 353.833] + - [642, 353.833] - - [1408, 2944, 1, 3328] - - [680, 8951.31] + - [683, 8951.31] - - [3584, 704, 1, 128] - - [670, 3395.31] + - [673, 3395.31] - - [4608, 12000, 1, 1536] - - [679, 6609.89] + - [682, 6609.89] - - [64, 1024, 1, 256] - - [625, 1588.75] + - [628, 1588.75] - - [5056, 5056, 1, 128] - - [670, 4080.71] + - [673, 4080.71] - - [2368, 448, 1, 1280] - - [681, 5422.94] + - [684, 5422.94] - - [128, 3584, 1, 256] - - [687, 4705.15] + - [690, 4705.15] - - [704, 448, 1, 1280] - - [684, 3960.97] + - [687, 3960.97] - - [8192, 800, 1, 2048] - - [682, 6306.26] + - [685, 6306.26] - - [448, 5056, 1, 128] - - [674, 3709.46] + - [677, 3709.46] - - [256, 4, 1, 1280] - - [706, 163.84] + - [709, 163.84] - - [5056, 3584, 1, 256] - - [679, 7008.24] + - [682, 7008.24] - - [2368, 4, 1, 3328] - - [639, 496.266] + - [642, 496.266] - - [1408, 5056, 1, 128] - - [674, 4175.27] + - [677, 4175.27] - - [2944, 3584, 1, 128] - - [670, 4659.69] + - [673, 4659.69] - - [3584, 2368, 1, 256] - - [692, 5851.77] + - [695, 5851.77] - - [128, 3584, 1, 3328] - - [682, 6104.94] + - [685, 6104.94] - - [128, 1024, 1, 1280] - - [622, 3847.99] + - [625, 3847.99] - - [8448, 24000, 1, 2816] - - [692, 5128.54] + - [695, 5128.54] - - [64, 704, 1, 256] - - [625, 1253.73] + - [628, 1253.73] - - [4288, 256, 1, 1280] - - [681, 5625.76] + - [684, 5625.76] - - [3584, 3584, 1, 3328] - - [686, 8206.05] + - [689, 8206.05] - - [4, 704, 1, 128] - - [701, 29.4484] + - [704, 29.4484] - - [5888, 6784, 1, 256] - - [688, 8248.65] + - [691, 8248.65] - - [4288, 2944, 1, 3328] - - [686, 8657.02] + - [689, 8657.02] - - [2944, 64, 1, 128] - - [605, 1240.6] + - [608, 1240.6] - - [1024, 128, 1, 3328] - - [630, 4433.0] + - [633, 4433.0] - - [1024, 16, 1, 500000] - - [593, 2571.05] + - [596, 2571.05] - - [4288, 128, 1, 3328] - - [630, 5716.75] + - [633, 5716.75] - - [7680, 128, 1, 2560] - - [628, 5488.0] + - [631, 5488.0] - - [256, 5056, 1, 1280] - - [688, 6379.96] + - [691, 6379.96] - - [1408, 256, 1, 128] - - [674, 1633.73] + - [677, 1633.73] - - [2944, 5888, 1, 3328] - - [683, 7848.92] + - [686, 7848.92] - - [6784, 5888, 1, 1280] - - [692, 9047.62] + - [695, 9047.62] - - [2048, 800, 1, 512] - - [687, 4841.07] + - [690, 4841.07] - - [704, 128, 1, 256] - - [632, 1567.17] + - [635, 1567.17] - - [5888, 4288, 1, 1280] - - [686, 7982.83] + - [689, 7982.83] - - [1024, 24000, 1, 2048] - - [688, 5774.3] + - [691, 5774.3] - - [448, 256, 1, 1280] - - [622, 3707.09] + - [625, 3707.09] - - [5888, 3584, 1, 128] - - [675, 3804.4] + - [678, 3804.4] - - [1024, 2944, 1, 128] - - [670, 3308.26] + - [673, 3308.26] - - [5056, 4, 1, 1280] - - [703, 468.962] + - [706, 468.962] - - [256, 1408, 1, 1280] - - [681, 4899.89] + - [684, 4899.89] - - [3072, 16, 1, 1024] - - [639, 1233.62] + - [642, 1233.62] - - [704, 3584, 1, 128] - - [670, 3919.43] + - [673, 3919.43] - - [5888, 448, 1, 3328] - - [700, 6095.61] + - [703, 6095.61] - - [2368, 4288, 1, 1280] - - [682, 8338.3] + - [685, 8338.3] - - [4288, 2944, 1, 128] - - [674, 3946.5] + - [677, 3946.5] - - [1024, 6784, 1, 3328] - - [688, 7494.28] + - [691, 7494.28] - - [128, 2368, 1, 256] - - [687, 2895.32] + - [690, 2895.32] - - [6784, 64, 1, 3328] - - [681, 5964.89] + - [684, 5964.89] - - [5056, 2944, 1, 3328] - - [692, 6605.53] + - [695, 6605.53] - - [448, 128, 1, 256] - - [625, 1339.42] + - [628, 1339.42] - - [2944, 3584, 1, 256] - - [688, 7165.56] + - [691, 7165.56] - - [1408, 1408, 1, 3328] - - [692, 8332.86] + - [695, 8332.86] - - [1856, 128, 1, 1280] - - [687, 4498.33] + - [690, 4498.33] - - [3584, 3584, 1, 128] - - [671, 4000.01] + - [674, 4000.01] - - [64, 3584, 1, 256] - - [698, 2383.13] + - [701, 2383.13] - - [1408, 4, 1, 3328] - - [649, 422.908] + - [652, 422.908] - - [128, 2944, 1, 3328] - - [654, 5429.93] + - [657, 5429.93] - - [3584, 704, 1, 256] - - [687, 6153.99] + - [690, 6153.99] - - [2944, 448, 1, 3328] - - [687, 6507.72] + - [690, 6507.72] - - [3584, 1408, 1, 3328] - - [692, 8829.63] + - [695, 8829.63] - - [704, 3584, 1, 1280] - - [682, 7860.23] + - [685, 7860.23] - - [2944, 6784, 1, 1280] - - [692, 8894.5] + - [695, 8894.5] - - [1856, 6784, 1, 256] - - [692, 8115.09] + - [695, 8115.09] - - [4288, 448, 1, 3328] - - [684, 6397.25] + - [687, 6397.25] - - [6784, 4288, 1, 128] - - [670, 4109.44] + - [673, 4109.44] - - [6784, 704, 1, 1280] - - [680, 7999.04] + - [683, 7999.04] - - [256, 4288, 1, 256] - - [684, 4603.84] + - [687, 4603.84] - - [3584, 6784, 1, 256] - - [692, 7361.55] + - [695, 7361.55] - - [6144, 12000, 1, 2048] - - [691, 6311.66] + - [694, 6311.66] - - [6144, 16, 1, 2560] - - [640, 2240.55] + - [643, 2240.55] - - [3584, 64, 1, 128] - - [611, 1292.26] + - [614, 1292.26] - - [5888, 1024, 1, 3328] - - [679, 8394.49] + - [682, 8394.49] - - [448, 64, 1, 128] - - [602, 262.144] + - [605, 262.144] - - [704, 6784, 1, 1280] - - [686, 7740.56] + - [689, 7740.56] - - [4, 1024, 1, 1280] - - [639, 378.821] + - [642, 378.821] - - [5888, 128, 1, 256] - - [687, 5003.58] + - [690, 5003.58] - - [4096, 16, 1, 4096] - - [639, 1585.75] + - [642, 1585.75] - - [1856, 5056, 1, 3328] - - [680, 8522.82] + - [683, 8522.82] - - [4, 6784, 1, 256] - - [624, 387.657] + - [627, 387.657] - - [1024, 3584, 1, 128] - - [674, 3031.51] + - [677, 3031.51] - - [1024, 1408, 1, 128] - - [676, 2600.75] + - [679, 2600.75] - - [2368, 2944, 1, 128] - - [673, 4340.16] + - [676, 4340.16] - - [5056, 64, 1, 256] - - [687, 3109.52] + - [690, 3109.52] - - [4, 448, 1, 1280] - - [707, 253.735] + - [710, 253.735] - - [5056, 2944, 1, 128] - - [678, 3739.91] + - [681, 3739.91] - - [5888, 5056, 1, 3328] - - [692, 9016.38] + - [695, 9016.38] - - [1024, 704, 1, 128] - - [674, 2363.56] + - [677, 2363.56] - - [5888, 2368, 1, 128] - - [677, 3651.73] + - [680, 3651.73] - - [128, 5056, 1, 3328] - - [681, 6243.54] + - [684, 6243.54] - - [3584, 6784, 1, 1280] - - [679, 9080.57] + - [682, 9080.57] - - [448, 4, 1, 1280] - - [707, 242.983] + - [710, 242.983] - - [1856, 5888, 1, 256] - - [692, 8182.02] + - [695, 8182.02] - - [256, 256, 1, 256] - - [625, 1542.02] + - [628, 1542.02] - - [256, 64, 1, 128] - - [606, 135.126] + - [609, 135.126] - - [4288, 4288, 1, 3328] - - [692, 8674.54] + - [695, 8674.54] - - [4288, 1408, 1, 1280] - - [680, 7867.08] + - [683, 7867.08] - - [3584, 5056, 1, 128] - - [670, 4457.73] + - [673, 4457.73] - - [4, 1024, 1, 3328] - - [619, 440.294] + - [622, 440.294] - - [4288, 2368, 1, 256] - - [700, 5699.47] + - [703, 5699.47] - - [2944, 5056, 1, 1280] - - [692, 8236.46] + - [695, 8236.46] - - [448, 6784, 1, 256] - - [682, 6620.52] + - [685, 6620.52] - - [64, 128, 1, 128] - - [607, 67.5629] + - [610, 67.5629] - - [1856, 2368, 1, 128] - - [674, 4233.6] + - [677, 4233.6] - - [6784, 2368, 1, 3328] - - [692, 8269.8] + - [695, 8269.8] - - [256, 1024, 1, 1280] - - [681, 4882.78] + - [684, 4882.78] - - [704, 4, 1, 128] - - [701, 19.011] + - [704, 19.011] - - [256, 4, 1, 256] - - [639, 46.8114] + - [642, 46.8114] - - [4288, 128, 1, 256] - - [687, 4273.39] + - [690, 4273.39] - - [4288, 1856, 1, 3328] - - [682, 8195.71] + - [685, 8195.71] - - [3584, 448, 1, 128] - - [675, 2750.55] + - [678, 2750.55] - - [2048, 1600, 1, 2048] - - [698, 5753.49] + - [701, 5753.49] - - [256, 4, 1, 3328] - - [708, 297.878] + - [711, 297.878] - - [4, 1408, 1, 1280] - - [706, 402.286] + - [709, 402.286] - - [3584, 64, 1, 1280] - - [695, 4096.0] + - [698, 4096.0] - - [1408, 448, 1, 128] - - [670, 2498.15] + - [673, 2498.15] - - [3584, 1024, 1, 1280] - - [692, 7252.08] + - [695, 7252.08] - - [1856, 5056, 1, 256] - - [686, 7711.49] + - [689, 7711.49] - - [4, 3584, 1, 256] - - [703, 314.214] + - [706, 314.214] - - [4, 2944, 1, 1280] - - [639, 483.118] + - [642, 483.118] - - [1024, 4288, 1, 256] - - [691, 6544.42] + - [694, 6544.42] - - [5888, 3584, 1, 3328] - - [680, 8105.05] + - [683, 8105.05] - - [1856, 4, 1, 256] - - [639, 252.732] + - [642, 252.732] - - [4, 256, 1, 256] - - [624, 48.1882] + - [627, 48.1882] - - [5056, 3584, 1, 3328] - - [685, 7354.7] + - [688, 7354.7] - - [704, 448, 1, 128] - - [678, 1233.81] + - [681, 1233.81] - - [2368, 1408, 1, 1280] - - [686, 6654.14] + - [689, 6654.14] - - [5056, 2944, 1, 1280] - - [692, 8505.62] + - [695, 8505.62] - - [4, 4, 1, 128] - - [702, 0.0478505] + - [705, 0.0478505] - - [3584, 256, 1, 256] - - [684, 4616.37] + - [687, 4616.37] - - [1024, 6784, 1, 256] - - [686, 7944.88] + - [689, 7944.88] - - [4, 128, 1, 256] - - [639, 29.2571] + - [642, 29.2571] - - [64, 64, 1, 1280] - - [650, 642.51] + - [653, 642.51] - - [5124, 9124, 1, 2048] - - [692, 8019.3] + - [695, 8019.3] - - [6784, 4, 1, 128] - - [701, 192.967] + - [704, 192.967] - - [2944, 1408, 1, 128] - - [670, 3827.03] + - [673, 3827.03] - - [448, 128, 1, 3328] - - [643, 4063.9] + - [646, 4063.9] - - [3584, 1408, 1, 1280] - - [692, 7180.73] + - [695, 7180.73] - - [64, 4288, 1, 3328] - - [638, 4786.74] + - [641, 4786.74] - - [5056, 6784, 1, 3328] - - [679, 7889.73] + - [682, 7889.73] - - [128, 2944, 1, 256] - - [682, 3599.59] + - [685, 3599.59] - - [128, 6784, 1, 128] - - [600, 2606.69] + - [603, 2606.69] - - [3584, 4288, 1, 256] - - [686, 7299.71] + - [689, 7299.71] - - [448, 1856, 1, 256] - - [682, 5206.97] + - [685, 5206.97] - - [1856, 6784, 1, 3328] - - [684, 8386.26] + - [687, 8386.26] - - [3584, 128, 1, 3328] - - [628, 5589.94] + - [631, 5589.94] - - [64, 1856, 1, 256] - - [621, 1949.28] + - [624, 1949.28] - - [64, 448, 1, 256] - - [626, 955.733] + - [629, 955.733] - - [5888, 4288, 1, 256] - - [690, 7791.74] + - [693, 7791.74] - - [4, 448, 1, 128] - - [701, 8.74146] + - [704, 8.74146] - - [5056, 1408, 1, 256] - - [692, 5153.91] + - [695, 5153.91] - - [35, 8457, 1, 2048] - - [597, 3182.47] + - [600, 3182.47] - - [64, 256, 1, 1280] - - [646, 1713.36] + - [649, 1713.36] - - [3584, 1024, 1, 256] - - [682, 6528.08] + - [685, 6528.08] - - [256, 704, 1, 256] - - [681, 2720.36] + - [684, 2720.36] - - [5888, 5888, 1, 256] - - [690, 7992.16] + - [693, 7992.16] - - [4288, 1024, 1, 1280] - - [684, 7837.4] + - [687, 7837.4] - - [5888, 128, 1, 3328] - - [687, 7181.03] + - [690, 7181.03] - - [448, 6784, 1, 3328] - - [681, 7663.0] + - [684, 7663.0] - - [2944, 1408, 1, 1280] - - [690, 7903.04] + - [693, 7903.04] - - [64, 128, 1, 1280] - - [639, 1191.56] + - [642, 1191.56] - - [2944, 1856, 1, 3328] - - [680, 7844.31] + - [683, 7844.31] - - [2368, 64, 1, 128] - - [611, 997.873] + - [614, 997.873] - - [256, 1024, 1, 128] - - [670, 1215.74] + - [673, 1215.74] - - [3584, 5888, 1, 1280] - - [679, 8958.84] + - [682, 8958.84] - - [64, 4, 1, 128] - - [702, 1.11608] + - [705, 1.11608] - - [6784, 1856, 1, 1280] - - [679, 6728.7] + - [682, 6728.7] - - [2944, 5056, 1, 256] - - [692, 8275.11] + - [695, 8275.11] - - [4288, 4, 1, 128] - - [701, 147.544] + - [704, 147.544] - - [5888, 256, 1, 3328] - - [688, 7094.1] + - [691, 7094.1] - - [2944, 4288, 1, 128] - - [673, 4611.45] + - [676, 4611.45] - - [3584, 1408, 1, 256] - - [683, 6542.96] + - [686, 6542.96] - - [704, 3584, 1, 3328] - - [682, 8117.1] + - [685, 8117.1] - - [4096, 3200, 1, 1024] - - [697, 6656.03] + - [700, 6656.03] - - [5056, 448, 1, 1280] - - [695, 6096.1] + - [698, 6096.1] - - [3584, 1856, 1, 3328] - - [680, 8552.31] + - [683, 8552.31] - - [4288, 6784, 1, 1280] - - [686, 8212.36] + - [689, 8212.36] - - [2560, 7000, 1, 2560] - - [688, 7655.24] + - [691, 7655.24] - - [1408, 704, 1, 1280] - - [684, 5756.69] + - [687, 5756.69] - - [2944, 1024, 1, 256] - - [692, 6880.81] + - [695, 6880.81] - - [6784, 64, 1, 256] - - [687, 4438.86] + - [690, 4438.86] - - [2368, 4288, 1, 3328] - - [688, 8377.89] + - [691, 8377.89] - - [4, 1408, 1, 256] - - [705, 222.499] + - [708, 222.499] - - [1024, 1408, 1, 1280] - - [682, 6339.28] + - [685, 6339.28] - - [64, 64, 1, 256] - - [639, 187.246] + - [642, 187.246] - - [704, 256, 1, 3328] - - [681, 4046.04] + - [684, 4046.04] - - [6784, 5056, 1, 256] - - [692, 7972.07] + - [695, 7972.07] - - [1856, 1856, 1, 128] - - [676, 3716.51] + - [679, 3716.51] - - [3584, 5056, 1, 3328] - - [692, 8684.66] + - [695, 8684.66] - - [448, 6784, 1, 128] - - [674, 3828.95] + - [677, 3828.95] - - [4, 704, 1, 3328] - - [707, 393.106] + - [710, 393.106] - - [35, 8457, 1, 4096] - - [596, 3173.14] + - [599, 3173.14] - - [448, 2944, 1, 256] - - [690, 5553.31] + - [693, 5553.31] - - [4, 4288, 1, 3328] - - [649, 573.111] + - [652, 573.111] - - [2944, 6784, 1, 256] - - [686, 8565.96] + - [689, 8565.96] - - [2944, 2944, 1, 128] - - [670, 4540.73] + - [673, 4540.73] - - [4, 4, 1, 1280] - - [649, 3.04762] + - [652, 3.04762] - - [1856, 3584, 1, 1280] - - [686, 7306.26] + - [689, 7306.26] - - [64, 2944, 1, 256] - - [698, 2292.51] + - [701, 2292.51] - - [448, 256, 1, 128] - - [607, 797.83] + - [610, 797.83] - - [4288, 448, 1, 128] - - [673, 3430.4] + - [676, 3430.4] - - [4608, 24000, 1, 1536] - - [691, 6820.14] + - [694, 6820.14] - - [1856, 1408, 1, 3328] - - [694, 6600.14] + - [697, 6600.14] - - [128, 128, 1, 128] - - [599, 161.817] + - [602, 161.817] - - [1024, 4288, 1, 3328] - - [682, 7936.98] + - [685, 7936.98] - - [448, 2368, 1, 256] - - [690, 4526.35] + - [693, 4526.35] - - [1024, 4, 1, 128] - - [702, 16.8907] + - [705, 16.8907] - - [64, 1408, 1, 1280] - - [622, 3345.22] + - [625, 3345.22] - - [64, 6784, 1, 1280] - - [687, 5526.5] + - [690, 5526.5] - - [5056, 448, 1, 256] - - [681, 4216.55] + - [684, 4216.55] - - [2944, 2368, 1, 3328] - - [692, 7000.32] + - [695, 7000.32] - - [704, 4288, 1, 3328] - - [698, 6414.33] + - [701, 6414.33] - - [1408, 128, 1, 256] - - [681, 2720.36] + - [684, 2720.36] - - [1024, 1856, 1, 1280] - - [692, 7682.83] + - [695, 7682.83] - - [2048, 6400, 1, 2048] - - [688, 7418.12] + - [691, 7418.12] - - [512, 48000, 1, 2816] - - [692, 8884.67] + - [695, 8884.67] - - [5124, 9124, 1, 2560] - - [684, 6040.7] + - [687, 6040.7] - - [128, 2368, 1, 3328] - - [638, 5025.56] + - [641, 5025.56] - - [1024, 5888, 1, 256] - - [686, 7322.11] + - [689, 7322.11] - - [64, 2944, 1, 1280] - - [622, 4222.21] + - [625, 4222.21] - - [5056, 64, 1, 3328] - - [663, 4936.22] + - [666, 4936.22] - - [128, 704, 1, 128] - - [608, 683.314] + - [611, 683.314] - - [1408, 2368, 1, 256] - - [687, 6404.12] + - [690, 6404.12] - - [1408, 1408, 1, 256] - - [692, 4537.83] + - [695, 4537.83] - - [4, 64, 1, 128] - - [701, 2.46747] + - [704, 2.46747] - - [64, 1024, 1, 128] - - [600, 532.272] + - [603, 532.272] - - [1024, 8, 1, 500000] - - [590, 1684.98] + - [593, 1684.98] - - [2368, 2368, 1, 128] - - [671, 4334.23] + - [674, 4334.23] - - [64, 5888, 1, 128] - - [600, 2003.09] + - [603, 2003.09] - - [5888, 4, 1, 3328] - - [618, 339.018] + - [621, 339.018] - - [6784, 1408, 1, 128] - - [674, 4431.13] + - [677, 4431.13] - - [4288, 5888, 1, 256] - - [692, 7800.78] + - [695, 7800.78] - - [1408, 5056, 1, 256] - - [686, 8153.28] + - [689, 8153.28] - - [5056, 128, 1, 3328] - - [643, 5829.83] + - [646, 5829.83] - - [128, 128, 1, 1280] - - [646, 1691.25] + - [649, 1691.25] - - [448, 704, 1, 256] - - [687, 3364.18] + - [690, 3364.18] - - [4288, 3584, 1, 128] - - [671, 2952.58] + - [674, 2952.58] - - [2944, 128, 1, 3328] - - [643, 5620.72] + - [646, 5620.72] - - [64, 1408, 1, 3328] - - [644, 4169.81] + - [647, 4169.81] - - [3584, 5056, 1, 1280] - - [689, 7780.66] + - [692, 7780.66] - - [256, 448, 1, 1280] - - [622, 3929.35] + - [625, 3929.35] - - [704, 704, 1, 128] - - [670, 2346.07] + - [673, 2346.07] - - [5056, 4, 1, 128] - - [701, 144.457] + - [704, 144.457] - - [704, 256, 1, 1280] - - [690, 2283.12] + - [693, 2283.12] - - [64, 2368, 1, 3328] - - [622, 4921.59] + - [625, 4921.59] - - [1856, 1024, 1, 128] - - [671, 3459.47] + - [674, 3459.47] - - [1856, 64, 1, 128] - - [603, 918.137] + - [606, 918.137] - - [4096, 64, 1, 4096] - - [648, 4000.52] + - [651, 4000.52] - - [1024, 24000, 1, 1536] - - [684, 8502.26] + - [687, 8502.26] - - [704, 4288, 1, 256] - - [688, 6003.73] + - [691, 6003.73] - - [5888, 2368, 1, 1280] - - [679, 8801.2] + - [682, 8801.2] - - [128, 256, 1, 256] - - [633, 1069.98] + - [636, 1069.98] - - [64, 128, 1, 256] - - [639, 374.491] + - [642, 374.491] - - [2368, 5888, 1, 1280] - - [682, 8308.53] + - [685, 8308.53] - - [5888, 256, 1, 1280] - - [690, 7154.32] + - [693, 7154.32] - - [1760, 128, 1, 1760] - - [631, 5363.81] + - [634, 5363.81] - - [4, 5888, 1, 1280] - - [639, 542.204] + - [642, 542.204] - - [704, 128, 1, 128] - - [611, 779.347] + - [614, 779.347] - - [1024, 4, 1, 1280] - - [639, 392.431] + - [642, 392.431] - - [2368, 1856, 1, 3328] - - [682, 7975.22] + - [685, 7975.22] - - [2368, 128, 1, 128] - - [604, 1584.86] + - [607, 1584.86] - - [2944, 704, 1, 256] - - [690, 4039.11] + - [693, 4039.11] - - [5056, 128, 1, 128] - - [670, 2575.79] + - [673, 2575.79] - - [2368, 1024, 1, 3328] - - [698, 6165.44] + - [701, 6165.44] - - [256, 704, 1, 3328] - - [681, 4028.64] + - [684, 4028.64] - - [704, 3584, 1, 256] - - [692, 6102.82] + - [695, 6102.82] - - [704, 2944, 1, 3328] - - [682, 8202.74] + - [685, 8202.74] - - [6784, 1024, 1, 128] - - [674, 4386.3] + - [677, 4386.3] - - [256, 448, 1, 128] - - [611, 834.095] + - [614, 834.095] - - [448, 1024, 1, 3328] - - [699, 5412.38] + - [702, 5412.38] - - [2944, 1024, 1, 3328] - - [692, 6265.77] + - [695, 6265.77] - - [2944, 5056, 1, 128] - - [670, 4770.78] + - [673, 4770.78] - - [2368, 256, 1, 256] - - [687, 3975.13] + - [690, 3975.13] - - [1408, 6784, 1, 256] - - [686, 7986.92] + - [689, 7986.92] - - [6784, 1408, 1, 3328] - - [686, 8472.61] + - [689, 8472.61] - - [4288, 6784, 1, 128] - - [677, 3865.1] + - [680, 3865.1] - - [704, 64, 1, 256] - - [625, 1287.31] + - [628, 1287.31] - - [5888, 4, 1, 1280] - - [624, 509.922] + - [627, 509.922] - - [256, 2368, 1, 3328] - - [687, 5837.55] + - [690, 5837.55] - - [6784, 2944, 1, 1280] - - [692, 8560.44] + - [695, 8560.44] - - [4288, 1856, 1, 128] - - [670, 4616.97] + - [673, 4616.97] - - [1856, 2944, 1, 128] - - [670, 4287.63] + - [673, 4287.63] - - [6784, 448, 1, 128] - - [674, 3893.33] + - [677, 3893.33] - - [64, 3584, 1, 128] - - [600, 1609.66] + - [603, 1609.66] - - [448, 5056, 1, 1280] - - [690, 7124.31] + - [693, 7124.31] - - [2368, 1856, 1, 128] - - [673, 4004.55] + - [676, 4004.55] - - [64, 2944, 1, 3328] - - [623, 5086.38] + - [626, 5086.38] - - [4288, 704, 1, 256] - - [688, 6176.47] + - [691, 6176.47] - - [256, 3584, 1, 128] - - [671, 2553.05] + - [674, 2553.05] - - [5888, 704, 1, 256] - - [687, 6781.41] + - [690, 6781.41] - - [3584, 1024, 1, 128] - - [674, 3660.85] + - [677, 3660.85] - - [256, 5888, 1, 3328] - - [690, 7772.03] + - [693, 7772.03] - - [1408, 4288, 1, 3328] - - [686, 8832.76] + - [689, 8832.76] - - [6784, 4288, 1, 256] - - [692, 8566.04] + - [695, 8566.04] - - [4288, 256, 1, 128] - - [672, 1953.69] + - [675, 1953.69] - - [5888, 256, 1, 256] - - [690, 3730.43] + - [693, 3730.43] - - [6784, 1024, 1, 1280] - - [686, 8578.29] + - [689, 8578.29] - - [5888, 1024, 1, 128] - - [671, 4092.86] + - [674, 4092.86] - - [1024, 128, 1, 256] - - [621, 1897.88] + - [624, 1897.88] - - [512, 16, 1, 500000] - - [592, 2363.69] + - [595, 2363.69] - - [128, 64, 1, 3328] - - [649, 1592.46] + - [652, 1592.46] - - [448, 64, 1, 256] - - [639, 976.068] + - [642, 976.068] - - [2368, 256, 1, 128] - - [674, 2094.89] + - [677, 2094.89] - - [6784, 3584, 1, 1280] - - [686, 8570.06] + - [689, 8570.06] - - [1024, 6784, 1, 1280] - - [692, 8203.47] + - [695, 8203.47] - - [2944, 64, 1, 1280] - - [630, 4300.51] + - [633, 4300.51] - - [1408, 2944, 1, 1280] - - [682, 7349.54] + - [685, 7349.54] - - [256, 1856, 1, 256] - - [681, 4649.65] + - [684, 4649.65] - - [2048, 800, 1, 2048] - - [700, 4668.63] + - [703, 4668.63] - - [1408, 2368, 1, 3328] - - [690, 7537.64] + - [693, 7537.64] - - [2944, 4, 1, 3328] - - [639, 514.042] + - [642, 514.042] - - [128, 1408, 1, 3328] - - [631, 4991.54] + - [634, 4991.54] - - [2944, 1856, 1, 128] - - [670, 4317.29] + - [673, 4317.29] - - [256, 2944, 1, 128] - - [670, 2258.17] + - [673, 2258.17] - - [256, 6784, 1, 128] - - [670, 3146.92] + - [673, 3146.92] - - [2368, 4, 1, 128] - - [702, 33.8286] + - [705, 33.8286] - - [1408, 256, 1, 3328] - - [681, 5077.75] + - [684, 5077.75] - - [1856, 4, 1, 128] - - [702, 21.4025] + - [705, 21.4025] - - [5056, 6784, 1, 128] - - [670, 4945.01] + - [673, 4945.01] - - [4288, 5056, 1, 128] - - [673, 4729.77] + - [676, 4729.77] - - [1856, 5888, 1, 128] - - [670, 4707.86] + - [673, 4707.86] - - [2944, 5888, 1, 256] - - [684, 8014.68] + - [687, 8014.68] - - [3584, 1856, 1, 256] - - [686, 7567.03] + - [689, 7567.03] - - [4288, 3584, 1, 1280] - - [679, 8726.33] + - [682, 8726.33] - - [2368, 448, 1, 256] - - [687, 4227.6] + - [690, 4227.6] - - [4288, 256, 1, 3328] - - [688, 5487.31] + - [691, 5487.31] - - [1856, 704, 1, 128] - - [674, 3124.96] + - [677, 3124.96] - - [1408, 64, 1, 256] - - [634, 1619.99] + - [637, 1619.99] - - [64, 1856, 1, 128] - - [598, 955.047] + - [601, 955.047] - - [4, 256, 1, 128] - - [701, 10.7789] + - [704, 10.7789] - - [2560, 16, 1, 2560] - - [646, 2019.6] + - [649, 2019.6] - - [704, 5888, 1, 128] - - [675, 3976.16] + - [678, 3976.16] - - [6784, 3584, 1, 128] - - [674, 4018.81] + - [677, 4018.81] - - [1024, 64, 1, 256] - - [639, 1370.69] + - [642, 1370.69] - - [64, 2368, 1, 256] - - [681, 2255.66] + - [684, 2255.66] - - [4288, 5056, 1, 3328] - - [686, 8368.59] + - [689, 8368.59] - - [4, 1856, 1, 1280] - - [639, 392.026] + - [642, 392.026] - - [4288, 128, 1, 128] - - [604, 2286.93] + - [607, 2286.93] - - [1408, 1408, 1, 128] - - [674, 3233.38] + - [677, 3233.38] - - [7680, 16, 1, 2560] - - [642, 2257.27] + - [645, 2257.27] - - [1856, 128, 1, 128] - - [604, 1532.7] + - [607, 1532.7] - - [5056, 2368, 1, 256] - - [686, 8167.19] + - [689, 8167.19] - - [4288, 704, 1, 3328] - - [692, 6411.06] + - [695, 6411.06] - - [448, 3584, 1, 256] - - [692, 5477.64] + - [695, 5477.64] - - [2368, 64, 1, 1280] - - [622, 3936.42] + - [625, 3936.42] - - [2368, 1024, 1, 1280] - - [688, 7688.72] + - [691, 7688.72] - - [2944, 1408, 1, 3328] - - [679, 7668.68] + - [682, 7668.68] - - [1408, 448, 1, 256] - - [681, 4863.88] + - [684, 4863.88] - - [1024, 1408, 1, 3328] - - [690, 7448.89] + - [693, 7448.89] - - [2944, 5888, 1, 1280] - - [680, 8208.47] + - [683, 8208.47] - - [1408, 4, 1, 1280] - - [619, 479.319] + - [622, 479.319] - - [5888, 3584, 1, 256] - - [680, 8609.99] + - [683, 8609.99] - - [2368, 5056, 1, 128] - - [677, 3726.15] + - [680, 3726.15] - - [1408, 1856, 1, 3328] - - [681, 7829.38] + - [684, 7829.38] - - [4, 4, 1, 3328] - - [708, 4.29419] + - [711, 4.29419] - - [6784, 1408, 1, 1280] - - [681, 7690.7] + - [684, 7690.7] - - [4096, 7000, 1, 4096] - - [693, 6272.39] + - [696, 6272.39] - - [704, 2944, 1, 256] - - [682, 6095.81] + - [685, 6095.81] - - [4288, 64, 1, 256] - - [647, 2121.21] + - [650, 2121.21] - - [6784, 5888, 1, 3328] - - [686, 8955.5] + - [689, 8955.5] - - [2368, 4288, 1, 128] - - [670, 4699.55] + - [673, 4699.55] - - [64, 4288, 1, 1280] - - [660, 4013.63] + - [663, 4013.63] - - [6784, 64, 1, 1280] - - [681, 5418.73] + - [684, 5418.73] - - [3584, 128, 1, 128] - - [610, 2165.2] + - [613, 2165.2] - - [1024, 6784, 1, 128] - - [671, 3765.2] + - [674, 3765.2] - - [4, 1856, 1, 128] - - [702, 33.2728] + - [705, 33.2728] - - [1408, 64, 1, 3328] - - [643, 4489.41] + - [646, 4489.41] - - [6784, 4, 1, 256] - - [639, 400.162] + - [642, 400.162] - - [1408, 1408, 1, 1280] - - [686, 8139.43] + - [689, 8139.43] - - [16384, 400, 1, 4096] - - [690, 6087.18] + - [693, 6087.18] - - [256, 2368, 1, 256] - - [681, 4766.25] + - [684, 4766.25] - - [448, 4288, 1, 3328] - - [688, 7576.98] + - [691, 7576.98] - - [2368, 1408, 1, 256] - - [684, 5284.43] + - [687, 5284.43] - - [5888, 5056, 1, 128] - - [671, 3643.5] + - [674, 3643.5] - - [704, 2368, 1, 256] - - [686, 5334.63] + - [689, 5334.63] - - [1024, 24000, 1, 2560] - - [694, 7437.96] + - [697, 7437.96] - - [2944, 448, 1, 1280] - - [695, 4937.43] + - [698, 4937.43] - - [5888, 2368, 1, 3328] - - [680, 8201.74] + - [683, 8201.74] - - [5124, 9124, 1, 1760] - - [687, 6763.96] + - [690, 6763.96] - - [448, 1408, 1, 1280] - - [681, 5881.44] + - [684, 5881.44] - - [448, 1856, 1, 1280] - - [688, 6225.46] + - [691, 6225.46] - - [4288, 448, 1, 1280] - - [690, 5626.27] + - [693, 5626.27] - - [5888, 704, 1, 3328] - - [684, 7873.52] + - [687, 7873.52] - - [5056, 256, 1, 128] - - [675, 2920.93] + - [678, 2920.93] - - [1856, 256, 1, 128] - - [677, 1995.32] + - [680, 1995.32] - - [64, 1408, 1, 128] - - [598, 758.838] + - [601, 758.838] - - [704, 4, 1, 256] - - [639, 130.597] + - [642, 130.597] - - [1408, 5888, 1, 128] - - [670, 4573.95] + - [673, 4573.95] - - [7680, 12000, 1, 2560] - - [686, 8747.03] + - [689, 8747.03] - - [1408, 1024, 1, 256] - - [683, 4609.13] + - [686, 4609.13] - - [8192, 400, 1, 2048] - - [695, 5283.15] + - [698, 5283.15] - - [1024, 1856, 1, 128] - - [670, 2686.28] + - [673, 2686.28] - - [256, 704, 1, 128] - - [670, 1004.73] + - [673, 1004.73] - - [2560, 128, 1, 2560] - - [648, 4259.04] + - [651, 4259.04] - - [448, 1024, 1, 256] - - [681, 4813.14] + - [684, 4813.14] - - [128, 4, 1, 3328] - - [707, 128.308] + - [710, 128.308] - - [5056, 6784, 1, 1280] - - [689, 6579.75] + - [692, 6579.75] - - [1408, 64, 1, 128] - - [611, 819.2] + - [614, 819.2] - - [1024, 448, 1, 1280] - - [690, 5703.21] + - [693, 5703.21] - - [704, 5056, 1, 3328] - - [682, 7574.39] + - [685, 7574.39] - - [128, 5056, 1, 256] - - [681, 5113.43] + - [684, 5113.43] - - [64, 1024, 1, 3328] - - [666, 3980.0] + - [669, 3980.0] - - [1856, 4, 1, 3328] - - [620, 433.153] + - [623, 433.153] - - [4, 2944, 1, 128] - - [702, 46.5225] + - [705, 46.5225] - - [2368, 2944, 1, 3328] - - [680, 9002.03] + - [683, 9002.03] - - [448, 448, 1, 1280] - - [622, 3969.42] + - [625, 3969.42] - - [2368, 3584, 1, 256] - - [692, 7806.29] + - [695, 7806.29] - - [5056, 3584, 1, 1280] - - [679, 8971.46] + - [682, 8971.46] - - [5124, 9124, 1, 4096] - - [692, 7208.62] + - [695, 7208.62] - - [7680, 48000, 1, 2560] - - [686, 3835.81] + - [689, 3835.81] - - [448, 4, 1, 3328] - - [707, 409.6] + - [710, 409.6] - - [1856, 2944, 1, 1280] - - [679, 7173.61] + - [682, 7173.61] - - [1024, 48000, 1, 2816] - - [686, 8976.16] + - [689, 8976.16] - - [128, 1024, 1, 256] - - [625, 1969.16] + - [628, 1969.16] - - [2944, 1408, 1, 256] - - [688, 4585.02] + - [691, 4585.02] - - [4288, 1408, 1, 3328] - - [682, 8237.17] + - [685, 8237.17] - - [3584, 64, 1, 3328] - - [628, 5183.06] + - [631, 5183.06] - - [5888, 2944, 1, 128] - - [677, 3674.46] + - [680, 3674.46] - - [2944, 1024, 1, 128] - - [674, 3834.22] + - [677, 3834.22] - - [4288, 5056, 1, 1280] - - [686, 8086.0] + - [689, 8086.0] - - [5888, 6784, 1, 1280] - - [680, 6941.22] + - [683, 6941.22] - - [6784, 5056, 1, 128] - - [671, 4860.05] + - [674, 4860.05] - - [256, 1024, 1, 3328] - - [695, 5156.12] + - [698, 5156.12] - - [3584, 4, 1, 256] - - [639, 332.429] + - [642, 332.429] - - [1760, 1600, 1, 1760] - - [682, 6330.66] + - [685, 6330.66] - - [1856, 64, 1, 3328] - - [643, 4755.93] + - [646, 4755.93] - - [4, 128, 1, 3328] - - [707, 160.144] + - [710, 160.144] - - [5888, 1408, 1, 3328] - - [680, 8722.64] + - [683, 8722.64] - - [448, 2944, 1, 128] - - [673, 2997.53] + - [676, 2997.53] - - [2368, 1856, 1, 256] - - [681, 6662.24] + - [684, 6662.24] - - [256, 5056, 1, 256] - - [683, 5256.19] + - [686, 5256.19] - - [128, 3584, 1, 128] - - [602, 2073.46] + - [605, 2073.46] - - [448, 3584, 1, 3328] - - [679, 6833.86] + - [682, 6833.86] - - [4, 5056, 1, 3328] - - [649, 581.423] + - [652, 581.423] - - [704, 2368, 1, 128] - - [670, 3402.19] + - [673, 3402.19] - - [5888, 256, 1, 128] - - [675, 2977.44] + - [678, 2977.44] - - [4, 5056, 1, 128] - - [701, 65.1074] + - [704, 65.1074] - - [448, 256, 1, 256] - - [687, 1764.43] + - [690, 1764.43] - - [704, 4, 1, 3328] - - [639, 398.454] + - [642, 398.454] - - [1408, 256, 1, 256] - - [682, 3463.76] + - [685, 3463.76] - - [3584, 1856, 1, 128] - - [678, 3228.09] + - [681, 3228.09] - - [4288, 4288, 1, 128] - - [674, 4853.83] + - [677, 4853.83] - - [1856, 1024, 1, 3328] - - [698, 5994.58] + - [701, 5994.58] - - [128, 5888, 1, 3328] - - [652, 6512.75] + - [655, 6512.75] - - [1024, 5056, 1, 256] - - [692, 7859.32] + - [695, 7859.32] - - [5888, 5888, 1, 1280] - - [692, 8131.34] + - [695, 8131.34] - - [5056, 5888, 1, 128] - - [671, 4920.61] + - [674, 4920.61] - - [2368, 1408, 1, 3328] - - [690, 7110.64] + - [693, 7110.64] - - [1024, 48000, 1, 1536] - - [690, 8590.72] + - [693, 8590.72] - - [5888, 448, 1, 256] - - [691, 3567.64] + - [694, 3567.64] - - [2560, 3200, 1, 2560] - - [681, 7638.21] + - [684, 7638.21] - - [5888, 6784, 1, 128] - - [671, 3910.82] + - [674, 3910.82] - - [6144, 48000, 1, 2048] - - [692, 3412.85] + - [695, 3412.85] - - [6784, 5056, 1, 1280] - - [683, 7890.12] + - [686, 7890.12] - - [5056, 704, 1, 1280] - - [687, 7664.96] + - [690, 7664.96] - - [1024, 48000, 1, 2560] - - [692, 8188.4] + - [695, 8188.4] - - [4608, 32, 1, 1536] - - [660, 2856.87] + - [663, 2856.87] - - [1024, 2368, 1, 128] - - [670, 3019.25] + - [673, 3019.25] - - [128, 704, 1, 256] - - [621, 1696.23] + - [624, 1696.23] - - [2368, 448, 1, 3328] - - [687, 5799.19] + - [690, 5799.19] - - [128, 5888, 1, 1280] - - [681, 6680.65] + - [684, 6680.65] - - [16384, 800, 1, 4096] - - [686, 6322.12] + - [689, 6322.12] - - [448, 128, 1, 1280] - - [660, 2849.39] + - [663, 2849.39] - - [6784, 4, 1, 3328] - - [639, 563.02] + - [642, 563.02] - - [5888, 5056, 1, 1280] - - [686, 8631.23] + - [689, 8631.23] - - [1024, 64, 1, 3328] - - [661, 3481.86] + - [664, 3481.86] - - [3072, 48000, 1, 1024] - - [686, 9019.39] + - [689, 9019.39] - - [64, 3584, 1, 1280] - - [623, 4327.85] + - [626, 4327.85] - - [6784, 1408, 1, 256] - - [686, 6320.49] + - [689, 6320.49] - - [3584, 5888, 1, 128] - - [673, 4406.69] + - [676, 4406.69] - - [5056, 5888, 1, 256] - - [692, 8037.03] + - [695, 8037.03] - - [2368, 1024, 1, 256] - - [684, 4936.04] + - [687, 4936.04] - - [2944, 1856, 1, 256] - - [692, 7222.22] + - [695, 7222.22] - - [1856, 6784, 1, 1280] - - [682, 8251.71] + - [685, 8251.71] - - [64, 5056, 1, 128] - - [602, 1643.6] + - [605, 1643.6] - - [64, 6784, 1, 128] - - [600, 1929.67] + - [603, 1929.67] - - [448, 704, 1, 128] - - [672, 979.859] + - [675, 979.859] - - [4, 1024, 1, 128] - - [701, 20.0416] + - [704, 20.0416] - - [4288, 3584, 1, 256] - - [686, 8444.04] + - [689, 8444.04] - - [1408, 704, 1, 128] - - [670, 3020.9] + - [673, 3020.9] - - [64, 256, 1, 3328] - - [666, 2227.37] + - [669, 2227.37] - - [6784, 448, 1, 3328] - - [692, 6573.01] + - [695, 6573.01] - - [5056, 1856, 1, 1280] - - [684, 7976.13] + - [687, 7976.13] - - [1408, 1024, 1, 3328] - - [682, 7470.23] + - [685, 7470.23] - - [2368, 256, 1, 3328] - - [687, 5394.27] + - [690, 5394.27] - - [5888, 3584, 1, 1280] - - [679, 9031.45] + - [682, 9031.45] - - [1856, 3584, 1, 3328] - - [694, 7272.5] + - [697, 7272.5] - - [5888, 128, 1, 1280] - - [687, 6684.38] + - [690, 6684.38] - - [1024, 2944, 1, 256] - - [692, 7414.99] + - [695, 7414.99] - - [448, 6784, 1, 1280] - - [688, 7923.68] + - [691, 7923.68] - - [256, 3584, 1, 1280] - - [684, 6901.77] + - [687, 6901.77] - - [704, 5056, 1, 256] - - [689, 5004.45] + - [692, 5004.45] - - [3584, 1024, 1, 3328] - - [681, 7894.53] + - [684, 7894.53] - - [2944, 1856, 1, 1280] - - [686, 7903.17] + - [689, 7903.17] - - [128, 256, 1, 128] - - [599, 325.645] + - [602, 325.645] - - [5056, 256, 1, 256] - - [683, 3356.46] + - [686, 3356.46] - - [2944, 4288, 1, 3328] - - [692, 7813.83] + - [695, 7813.83] - - [2368, 3584, 1, 3328] - - [692, 8370.99] + - [695, 8370.99] - - [2944, 704, 1, 1280] - - [698, 5513.99] + - [701, 5513.99] - - [128, 4, 1, 256] - - [639, 25.2062] + - [642, 25.2062] - - [2944, 3584, 1, 1280] - - [686, 7738.73] + - [689, 7738.73] - - [1856, 5888, 1, 1280] - - [680, 8584.53] + - [683, 8584.53] - - [256, 256, 1, 1280] - - [660, 2962.08] + - [663, 2962.08] - - [2048, 3200, 1, 2048] - - [688, 6911.59] + - [691, 6911.59] - - [4288, 1408, 1, 256] - - [686, 7953.9] + - [689, 7953.9] - - [3584, 64, 1, 256] - - [687, 2780.32] + - [690, 2780.32] - - [64, 1856, 1, 3328] - - [622, 4911.94] + - [625, 4911.94] - - [256, 1408, 1, 128] - - [670, 1373.14] + - [673, 1373.14] - - [5888, 1408, 1, 128] - - [675, 4241.91] + - [678, 4241.91] - - [4288, 2368, 1, 1280] - - [684, 8012.6] + - [687, 8012.6] - - [4, 4288, 1, 256] - - [705, 301.574] + - [708, 301.574] - - [256, 4288, 1, 128] - - [670, 2706.26] + - [673, 2706.26] - - [2048, 128, 1, 2048] - - [665, 2885.16] + - [668, 2885.16] - - [256, 128, 1, 3328] - - [667, 3170.11] + - [670, 3170.11] - - [512, 8, 1, 500000] - - [591, 1915.02] + - [594, 1915.02] - - [6784, 2368, 1, 256] - - [686, 8323.56] + - [689, 8323.56] - - [5888, 128, 1, 128] - - [674, 2465.98] + - [677, 2465.98] - - [1024, 24000, 1, 2816] - - [684, 8131.54] + - [687, 8131.54] - - [7680, 5984, 1, 2560] - - [688, 6040.67] + - [691, 6040.67] - - [4288, 1856, 1, 256] - - [700, 5818.43] + - [703, 5818.43] - - [1856, 256, 1, 3328] - - [681, 6531.93] + - [684, 6531.93] - - [1856, 2944, 1, 256] - - [686, 7312.82] + - [689, 7312.82] - - [5056, 1024, 1, 128] - - [676, 4102.9] + - [679, 4102.9] - - [64, 5888, 1, 1280] - - [681, 5058.15] + - [684, 5058.15] - - [1760, 800, 1, 1760] - - [684, 7279.9] + - [687, 7279.9] - - [6784, 256, 1, 128] - - [674, 3257.59] + - [677, 3257.59] - - [5888, 704, 1, 128] - - [670, 3813.83] + - [673, 3813.83] - - [1408, 2368, 1, 128] - - [671, 3561.17] + - [674, 3561.17] - - [1024, 4288, 1, 1280] - - [690, 7752.64] + - [693, 7752.64] - - [2368, 5056, 1, 3328] - - [693, 7711.81] + - [696, 7711.81] - - [448, 4, 1, 128] - - [701, 18.3795] + - [704, 18.3795] - - [4, 256, 1, 3328] - - [708, 269.61] + - [711, 269.61] - - [4288, 1024, 1, 3328] - - [687, 7910.17] + - [690, 7910.17] - - [6144, 48000, 1, 2560] - - [686, 3540.99] + - [689, 3540.99] - - [1024, 5056, 1, 3328] - - [680, 8509.56] + - [683, 8509.56] - - [1024, 1856, 1, 3328] - - [686, 7907.83] + - [689, 7907.83] - - [704, 704, 1, 1280] - - [698, 5648.05] + - [701, 5648.05] - - [128, 2368, 1, 1280] - - [657, 4145.01] + - [660, 4145.01] - - [1408, 128, 1, 3328] - - [630, 4919.5] + - [633, 4919.5] - - [3584, 256, 1, 1280] - - [682, 5185.46] + - [685, 5185.46] - - [4, 128, 1, 128] - - [701, 2.97891] + - [704, 2.97891] - - [5888, 64, 1, 1280] - - [630, 4499.49] + - [633, 4499.49] - - [3584, 128, 1, 1280] - - [687, 5928.91] + - [690, 5928.91] - - [4, 256, 1, 1280] - - [706, 170.667] + - [709, 170.667] - - [128, 704, 1, 3328] - - [630, 4379.27] + - [633, 4379.27] - - [4288, 6784, 1, 256] - - [680, 7180.99] + - [683, 7180.99] - - [3584, 2944, 1, 3328] - - [686, 8553.2] + - [689, 8553.2] - - [128, 1856, 1, 256] - - [687, 3207.67] + - [690, 3207.67] - - [64, 4288, 1, 256] - - [681, 2907.89] + - [684, 2907.89] - - [4, 3584, 1, 3328] - - [639, 560.505] + - [642, 560.505] - - [64, 4, 1, 3328] - - [708, 67.4025] + - [711, 67.4025] - - [4, 64, 1, 3328] - - [708, 88.7467] + - [711, 88.7467] - - [5888, 2944, 1, 256] - - [686, 7255.67] + - [689, 7255.67] - - [1856, 64, 1, 256] - - [632, 1743.62] + - [635, 1743.62] - - [5056, 128, 1, 1280] - - [687, 6009.69] + - [690, 6009.69] - - [448, 4288, 1, 1280] - - [688, 6466.72] + - [691, 6466.72] - - [448, 1856, 1, 3328] - - [688, 6381.89] + - [691, 6381.89] - - [1024, 4288, 1, 128] - - [673, 3491.77] + - [676, 3491.77] - - [4, 1024, 1, 256] - - [706, 172.463] + - [709, 172.463] - - [5056, 4288, 1, 256] - - [686, 8241.42] + - [689, 8241.42] - - [1024, 448, 1, 256] - - [690, 4218.41] + - [693, 4218.41] - - [1024, 3584, 1, 256] - - [686, 6513.59] + - [689, 6513.59] - - [2944, 128, 1, 1280] - - [630, 4710.38] + - [633, 4710.38] - - [2048, 32, 1, 2048] - - [645, 1779.13] + - [648, 1779.13] - - [64, 256, 1, 256] - - [639, 655.36] + - [642, 655.36] - - [1408, 4, 1, 128] - - [702, 20.0249] + - [705, 20.0249] - - [128, 2368, 1, 128] - - [602, 1707.63] + - [605, 1707.63] - - [256, 704, 1, 1280] - - [681, 3735.21] + - [684, 3735.21] - - [64, 2368, 1, 128] - - [609, 1049.71] + - [612, 1049.71] - - [6784, 6784, 1, 3328] - - [686, 9277.84] + - [689, 9277.84] - - [448, 5888, 1, 1280] - - [692, 7319.65] + - [695, 7319.65] - - [5056, 448, 1, 128] - - [674, 3694.33] + - [677, 3694.33] - - [4288, 704, 1, 1280] - - [684, 7890.86] + - [687, 7890.86] - - [3584, 2944, 1, 128] - - [676, 4124.61] + - [679, 4124.61] - - [6784, 256, 1, 1280] - - [692, 7185.73] + - [695, 7185.73] - - [256, 2944, 1, 1280] - - [681, 6736.66] + - [684, 6736.66] - - [64, 4288, 1, 128] - - [600, 1614.31] + - [603, 1614.31] - - [2368, 5888, 1, 3328] - - [682, 8616.36] + - [685, 8616.36] - - [4, 64, 1, 256] - - [619, 11.3778] + - [622, 11.3778] - - [704, 1024, 1, 3328] - - [687, 6801.82] + - [690, 6801.82] - - [2368, 1856, 1, 1280] - - [684, 7853.47] + - [687, 7853.47] - - [448, 5056, 1, 3328] - - [687, 7452.94] + - [690, 7452.94] - - [128, 448, 1, 128] - - [602, 530.349] + - [605, 530.349] - - [128, 6784, 1, 256] - - [682, 5557.45] + - [685, 5557.45] - - [3584, 4288, 1, 128] - - [673, 4462.63] + - [676, 4462.63] - - [64, 448, 1, 128] - - [602, 278.032] + - [605, 278.032] - - [5888, 4288, 1, 3328] - - [679, 9153.45] + - [682, 9153.45] - - [2368, 704, 1, 256] - - [686, 5350.68] + - [689, 5350.68] - - [256, 1856, 1, 3328] - - [681, 6536.25] + - [684, 6536.25] - - [1856, 128, 1, 256] - - [695, 2847.26] + - [698, 2847.26] - - [6784, 128, 1, 128] - - [675, 2530.72] + - [678, 2530.72] - - [3584, 1408, 1, 128] - - [676, 3625.52] + - [679, 3625.52] - - [1856, 5056, 1, 1280] - - [682, 8123.29] + - [685, 8123.29] - - [2944, 1024, 1, 1280] - - [692, 8450.31] + - [695, 8450.31] - - [5056, 4, 1, 256] - - [706, 380.687] + - [709, 380.687] - - [3584, 5888, 1, 3328] - - [684, 8567.89] + - [687, 8567.89] - - [2368, 4288, 1, 256] - - [688, 7857.97] + - [691, 7857.97] - - [1024, 2368, 1, 3328] - - [682, 6776.35] + - [685, 6776.35] - - [64, 704, 1, 3328] - - [637, 3503.42] + - [640, 3503.42] - - [704, 1408, 1, 256] - - [682, 6099.89] + - [685, 6099.89] - - [4096, 128, 1, 4096] - - [662, 4116.47] + - [665, 4116.47] - - [1024, 3584, 1, 1280] - - [692, 7231.55] + - [695, 7231.55] - - [4288, 5888, 1, 3328] - - [686, 8762.32] + - [689, 8762.32] - - [4288, 4, 1, 1280] - - [639, 492.697] + - [642, 492.697] - - [4608, 16, 1, 1536] - - [640, 1892.48] + - [643, 1892.48] - - [5888, 64, 1, 128] - - [617, 1747.63] + - [620, 1747.63] - - [4, 5888, 1, 128] - - [702, 84.4915] + - [705, 84.4915] - - [1024, 2944, 1, 3328] - - [690, 6906.95] + - [693, 6906.95] - - [6784, 1856, 1, 256] - - [686, 6273.97] + - [689, 6273.97] - - [2048, 64, 1, 2048] - - [669, 2371.34] + - [672, 2371.34] - - [256, 6784, 1, 1280] - - [686, 7066.94] + - [689, 7066.94] - - [1856, 3584, 1, 256] - - [692, 7706.77] + - [695, 7706.77] - - [128, 448, 1, 3328] - - [637, 3995.83] + - [640, 3995.83] - - [6784, 1856, 1, 128] - - [674, 4458.99] + - [677, 4458.99] - - [4, 448, 1, 256] - - [639, 84.3294] + - [642, 84.3294] - - [5056, 128, 1, 256] - - [687, 4954.4] + - [690, 4954.4] - - [512, 24000, 1, 2816] - - [680, 8994.88] + - [683, 8994.88] - - [256, 5888, 1, 1280] - - [679, 6183.9] + - [682, 6183.9] - - [4, 128, 1, 1280] - - [707, 71.8597] + - [710, 71.8597] - - [16384, 1600, 1, 4096] - - [686, 6920.99] + - [689, 6920.99] - - [6784, 128, 1, 1280] - - [690, 6486.27] + - [693, 6486.27] - - [64, 1408, 1, 256] - - [627, 1647.76] + - [630, 1647.76] - - [2368, 1408, 1, 128] - - [674, 3937.0] + - [677, 3937.0] - - [1856, 448, 1, 256] - - [687, 4635.47] + - [690, 4635.47] - - [1408, 1024, 1, 128] - - [670, 3208.41] + - [673, 3208.41] - - [128, 64, 1, 128] - - [599, 70.092] + - [602, 70.092] - - [6784, 3584, 1, 3328] - - [692, 8466.18] + - [695, 8466.18] - - [1760, 7000, 1, 1760] - - [690, 8149.11] + - [693, 8149.11] - - [2944, 64, 1, 3328] - - [623, 5017.99] + - [626, 5017.99] - - [64, 64, 1, 128] - - [599, 35.4249] + - [602, 35.4249] - - [2368, 5056, 1, 1280] - - [686, 8763.9] + - [689, 8763.9] - - [64, 4, 1, 1280] - - [708, 43.5745] + - [711, 43.5745] - - [1408, 2368, 1, 1280] - - [687, 7660.28] + - [690, 7660.28] - - [128, 1408, 1, 1280] - - [622, 4185.17] + - [625, 4185.17] - - [256, 64, 1, 3328] - - [647, 2071.65] + - [650, 2071.65] - - [704, 4288, 1, 128] - - [670, 4069.08] + - [673, 4069.08] - - [128, 1856, 1, 3328] - - [653, 5776.05] + - [656, 5776.05] - - [2944, 2944, 1, 256] - - [692, 7949.21] + - [695, 7949.21] - - [2944, 4, 1, 1280] - - [639, 483.118] + - [642, 483.118] - - [5888, 4, 1, 256] - - [624, 396.665] + - [627, 396.665] - - [6784, 256, 1, 256] - - [698, 4044.73] + - [701, 4044.73] - - [256, 5056, 1, 3328] - - [681, 7607.27] + - [684, 7607.27] - - [128, 4288, 1, 1280] - - [622, 4958.68] + - [625, 4958.68] - - [5056, 1856, 1, 128] - - [674, 4560.84] + - [677, 4560.84] - - [5056, 1024, 1, 3328] - - [686, 8634.08] + - [689, 8634.08] - - [128, 128, 1, 256] - - [624, 699.051] + - [627, 699.051] - - [1760, 64, 1, 1760] - - [630, 4580.55] + - [633, 4580.55] - - [4288, 3584, 1, 3328] - - [692, 9143.66] + - [695, 9143.66] - - [448, 704, 1, 3328] - - [681, 4473.33] + - [684, 4473.33] - - [448, 448, 1, 128] - - [612, 1264.28] + - [615, 1264.28] - - [1024, 2368, 1, 1280] - - [690, 7452.41] + - [693, 7452.41] - - [1856, 704, 1, 3328] - - [681, 6103.24] + - [684, 6103.24] - - [4, 2368, 1, 128] - - [701, 95.919] + - [704, 95.919] - - [5888, 6784, 1, 3328] - - [686, 9131.64] + - [689, 9131.64] - - [704, 4288, 1, 1280] - - [688, 7906.36] + - [691, 7906.36] - - [704, 256, 1, 256] - - [681, 2772.68] + - [684, 2772.68] - - [1024, 48000, 1, 2048] - - [685, 6513.35] + - [688, 6513.35] - - [4288, 1024, 1, 128] - - [670, 4291.67] + - [673, 4291.67] - - [256, 64, 1, 3136] - - [711, 3015.27] + - [714, 3015.27] - - [256, 1024, 1, 196] - - [715, 4225.35] + - [718, 4225.35] - - [1024, 1024, 1, 3328] - - [827, 8705.0] + - [830, 8705.0] - - [2048, 200, 1, 3200] - - [832, 6173.32] + - [835, 6173.32] - - [1024, 200, 1, 13312] - - [730, 5213.21] + - [733, 5213.21] - - [1024, 256, 1, 1536] - - [832, 5859.33] + - [835, 5859.33] - - [4096, 256, 1, 12288] - - [837, 8807.42] + - [840, 8807.42] - - [64, 200, 1, 1024] - - [804, 366.532] + - [807, 366.532] - - [32, 512, 1, 1024] - - [759, 452.949] + - [762, 452.949] - - [2048, 256, 1, 3328] - - [821, 7876.63] + - [824, 7876.63] - - [4096, 512, 1, 32] - - [825, 3975.64] + - [828, 3975.64] - - [2048, 256, 1, 13312] - - [802, 7837.71] + - [805, 7837.71] - - [4096, 200, 1, 11264] - - [837, 6902.66] + - [840, 6902.66] - - [2048, 512, 1, 1024] - - [831, 8100.04] + - [834, 8100.04] - - [2048, 1024, 1, 1664] - - [731, 9081.98] + - [734, 9081.98] - - [1024, 1024, 1, 64] - - [827, 4258.18] + - [830, 4258.18] - - [512, 1024, 1, 1536] - - [821, 7597.23] + - [824, 7597.23] - - [1024, 256, 1, 15360] - - [722, 6735.14] + - [725, 6735.14] - - [1, 512, 1, 1024] - - [772, 15.0657] + - [775, 15.0657] - - [4096, 512, 1, 1408] - - [734, 9024.42] + - [737, 9024.42] - - [1024, 200, 1, 1408] - - [832, 4460.99] + - [835, 4460.99] - - [1024, 512, 1, 512] - - [826, 6528.1] + - [829, 6528.1] - - [4096, 256, 1, 15360] - - [833, 8823.93] + - [836, 8823.93] - - [2048, 512, 1, 640] - - [823, 7989.15] + - [826, 7989.15] - - [4096, 1024, 1, 1280] - - [729, 9421.44] + - [732, 9421.44] - - [1024, 200, 1, 6144] - - [821, 4966.42] + - [824, 4966.42] - - [1024, 1024, 1, 512] - - [823, 7731.44] + - [826, 7731.44] - - [128, 512, 1, 2048] - - [739, 2190.24] + - [742, 2190.24] - - [2048, 1024, 1, 640] - - [729, 8581.7] + - [732, 8581.7] - - [1024, 256, 1, 3328] - - [821, 6192.61] + - [824, 6192.61] - - [4096, 1024, 1, 13312] - - [734, 9642.49] + - [737, 9642.49] - - [2048, 256, 1, 2048] - - [821, 7485.65] + - [824, 7485.65] - - [2048, 1024, 1, 13312] - - [734, 9352.16] + - [737, 9352.16] - - [2048, 512, 1, 16640] - - [822, 8839.07] + - [825, 8839.07] - - [1024, 512, 1, 128] - - [826, 4279.9] + - [829, 4279.9] - - [2048, 1024, 1, 3584] - - [729, 9264.62] + - [732, 9264.62] - - [2048, 512, 1, 256] - - [837, 6990.51] + - [840, 6990.51] - - [512, 256, 1, 3200] - - [784, 4154.42] + - [787, 4154.42] - - [4096, 1024, 1, 1920] - - [729, 9535.22] + - [732, 9535.22] - - [4096, 200, 1, 2560] - - [834, 6754.55] + - [837, 6754.55] - - [1024, 256, 1, 16384] - - [724, 6289.5] + - [727, 6289.5] - - [1024, 1024, 1, 1152] - - [827, 8407.29] + - [830, 8407.29] - - [2048, 200, 1, 32] - - [770, 1412.41] + - [773, 1412.41] - - [512, 1024, 1, 2816] - - [821, 7843.15] + - [824, 7843.15] - - [4096, 256, 1, 14336] - - [833, 8844.67] + - [836, 8844.67] - - [1024, 200, 1, 4608] - - [832, 4931.64] + - [835, 4931.64] - - [1024, 200, 1, 16384] - - [727, 5135.05] + - [730, 5135.05] - - [64, 256, 1, 1024] - - [805, 460.913] + - [808, 460.913] - - [1, 200, 1, 1024] - - [787, 7.39884] + - [790, 7.39884] - - [2048, 200, 1, 2080] - - [832, 6033.77] + - [835, 6033.77] - - [512, 256, 1, 1792] - - [742, 3153.61] + - [745, 3153.61] - - [2048, 200, 1, 1024] - - [832, 5711.2] + - [835, 5711.2] - - [4096, 1024, 1, 12288] - - [729, 9658.13] + - [732, 9658.13] - - [4096, 200, 1, 4096] - - [823, 6834.45] + - [826, 6834.45] - - [1024, 512, 1, 11264] - - [790, 7686.36] + - [793, 7686.36] - - [128, 512, 1, 1024] - - [760, 1458.89] + - [763, 1458.89] - - [32, 256, 1, 2048] - - [778, 384.799] + - [781, 384.799] - - [1024, 200, 1, 1792] - - [832, 4638.54] + - [835, 4638.54] - - [1024, 1024, 1, 1792] - - [827, 8550.46] + - [830, 8550.46] - - [32, 256, 1, 512] - - [811, 161.319] + - [814, 161.319] - - [512, 200, 1, 2816] - - [737, 3353.0] + - [740, 3353.0] - - [512, 200, 1, 3072] - - [722, 3298.79] + - [725, 3298.79] - - [1024, 1024, 1, 8192] - - [768, 8369.0] + - [771, 8369.0] - - [1024, 256, 1, 12288] - - [725, 6475.61] + - [728, 6475.61] - - [4096, 200, 1, 768] - - [827, 6367.87] + - [830, 6367.87] - - [1024, 512, 1, 16384] - - [843, 7367.02] + - [846, 7367.02] - - [4096, 256, 1, 1024] - - [823, 8214.06] + - [826, 8214.06] - - [1024, 512, 1, 256] - - [826, 5537.03] + - [829, 5537.03] - - [4096, 1024, 1, 8320] - - [729, 9674.16] + - [732, 9674.16] - - [4096, 256, 1, 9216] - - [831, 8790.92] + - [834, 8790.92] - - [1024, 512, 1, 1408] - - [821, 7459.55] + - [824, 7459.55] - - [1024, 512, 1, 5632] - - [832, 7997.81] + - [835, 7997.81] - - [4096, 200, 1, 256] - - [837, 5371.8] + - [840, 5371.8] - - [1024, 200, 1, 128] - - [815, 1998.05] + - [818, 1998.05] - - [256, 200, 1, 1024] - - [784, 1195.91] + - [787, 1195.91] - - [1024, 200, 1, 5120] - - [832, 4957.34] + - [835, 4957.34] - - [512, 1024, 1, 3072] - - [845, 7103.97] + - [848, 7103.97] - - [4096, 1024, 1, 15360] - - [729, 9668.94] + - [732, 9668.94] - - [1, 256, 1, 2048] - - [771, 13.8262] + - [774, 13.8262] - - [1024, 1024, 1, 4160] - - [823, 8759.2] + - [826, 8759.2] - - [1024, 256, 1, 256] - - [830, 3728.27] + - [833, 3728.27] - - [2048, 256, 1, 384] - - [832, 6123.07] + - [835, 6123.07] - - [512, 256, 1, 2560] - - [786, 3809.54] + - [789, 3809.54] - - [4096, 512, 1, 3072] - - [734, 9215.09] + - [737, 9215.09] - - [1024, 256, 1, 4160] - - [821, 6293.39] + - [824, 6293.39] - - [4096, 512, 1, 13312] - - [731, 9367.22] + - [734, 9367.22] - - [4096, 1024, 1, 3840] - - [729, 9631.47] + - [732, 9631.47] - - [4096, 200, 1, 640] - - [827, 6206.06] + - [830, 6206.06] - - [32, 200, 1, 2048] - - [765, 303.407] + - [768, 303.407] - - [1024, 200, 1, 512] - - [821, 3713.09] + - [824, 3713.09] - - [1024, 1024, 1, 7168] - - [824, 8475.64] + - [827, 8475.64] - - [2048, 1024, 1, 3200] - - [729, 9271.24] + - [732, 9271.24] - - [512, 512, 1, 1536] - - [832, 5832.17] + - [835, 5832.17] - - [4096, 256, 1, 768] - - [837, 8065.97] + - [840, 8065.97] - - [2048, 256, 1, 6656] - - [821, 8034.77] + - [824, 8034.77] - - [1024, 256, 1, 896] - - [821, 5467.44] + - [824, 5467.44] - - [2048, 256, 1, 512] - - [832, 6465.21] + - [835, 6465.21] - - [2048, 200, 1, 3072] - - [832, 6165.68] + - [835, 6165.68] - - [128, 200, 1, 1024] - - [789, 692.77] + - [792, 692.77] - - [4096, 512, 1, 3840] - - [734, 9272.6] + - [737, 9272.6] - - [1024, 200, 1, 3200] - - [832, 4838.75] + - [835, 4838.75] - - [4096, 512, 1, 5632] - - [729, 9335.42] + - [732, 9335.42] - - [4096, 512, 1, 64] - - [764, 5275.85] + - [767, 5275.85] - - [1024, 512, 1, 2816] - - [821, 7816.58] + - [824, 7816.58] - - [4096, 256, 1, 7680] - - [827, 8795.4] + - [830, 8795.4] - - [4096, 200, 1, 1024] - - [837, 6448.81] + - [840, 6448.81] - - [1024, 512, 1, 12288] - - [791, 7624.57] + - [794, 7624.57] - - [2048, 1024, 1, 512] - - [734, 8436.06] + - [737, 8436.06] - - [128, 256, 1, 2048] - - [808, 1342.18] + - [811, 1342.18] - - [2048, 200, 1, 1792] - - [832, 6020.37] + - [835, 6020.37] - - [1024, 1024, 1, 2816] - - [823, 8670.4] + - [826, 8670.4] - - [2048, 512, 1, 1536] - - [834, 8466.22] + - [837, 8466.22] - - [4096, 256, 1, 3072] - - [831, 8631.37] + - [834, 8631.37] - - [1024, 200, 1, 1536] - - [813, 4577.6] + - [816, 4577.6] - - [1024, 256, 1, 1024] - - [821, 5491.72] + - [824, 5491.72] - - [4096, 512, 1, 8192] - - [734, 9325.54] + - [737, 9325.54] - - [128, 1024, 1, 512] - - [832, 2534.32] + - [835, 2534.32] - - [4096, 512, 1, 2304] - - [729, 9192.99] + - [732, 9192.99] - - [2048, 256, 1, 5632] - - [832, 7999.54] + - [835, 7999.54] - - [1024, 256, 1, 5120] - - [832, 6307.22] + - [835, 6307.22] - - [1024, 512, 1, 6656] - - [832, 8028.85] + - [835, 8028.85] - - [4096, 512, 1, 2816] - - [729, 9234.4] + - [732, 9234.4] - - [4096, 200, 1, 2080] - - [816, 6697.86] + - [819, 6697.86] - - [1024, 200, 1, 2304] - - [832, 4752.81] + - [835, 4752.81] - - [2048, 200, 1, 13312] - - [821, 6346.13] + - [824, 6346.13] - - [64, 1024, 1, 1024] - - [805, 1359.58] + - [808, 1359.58] - - [4096, 256, 1, 3584] - - [827, 8668.8] + - [830, 8668.8] - - [2048, 1024, 1, 7680] - - [729, 9365.78] + - [732, 9365.78] - - [1024, 256, 1, 1664] - - [821, 5907.47] + - [824, 5907.47] - - [1, 512, 1, 2048] - - [748, 23.4057] + - [751, 23.4057] - - [512, 512, 1, 1024] - - [821, 5360.13] + - [824, 5360.13] - - [2048, 256, 1, 8192] - - [793, 7665.21] + - [796, 7665.21] - - [2048, 512, 1, 512] - - [823, 7767.23] + - [826, 7767.23] - - [4096, 512, 1, 1920] - - [729, 9132.94] + - [732, 9132.94] - - [4096, 200, 1, 12288] - - [837, 6910.65] + - [840, 6910.65] - - [1024, 512, 1, 3072] - - [767, 7310.33] + - [770, 7310.33] - - [2048, 512, 1, 1152] - - [827, 8342.26] + - [830, 8342.26] - - [1024, 256, 1, 2080] - - [821, 6010.36] + - [824, 6010.36] - - [4096, 1024, 1, 32] - - [817, 4793.49] + - [820, 4793.49] - - [4096, 512, 1, 16640] - - [729, 9365.31] + - [732, 9365.31] - - [2048, 200, 1, 9216] - - [821, 6315.88] + - [824, 6315.88] - - [2048, 200, 1, 2560] - - [821, 6119.14] + - [824, 6119.14] - - [2048, 1024, 1, 1024] - - [729, 8628.59] + - [732, 8628.59] - - [2048, 256, 1, 4608] - - [821, 7951.29] + - [824, 7951.29] - - [512, 200, 1, 768] - - [773, 2132.41] + - [776, 2132.41] - - [128, 256, 1, 512] - - [773, 670.017] + - [776, 670.017] - - [4096, 512, 1, 1792] - - [734, 9126.91] + - [737, 9126.91] - - [4096, 1024, 1, 8192] - - [729, 9591.27] + - [732, 9591.27] - - [1024, 256, 1, 2816] - - [832, 6119.01] + - [835, 6119.01] - - [1024, 1024, 1, 13312] - - [824, 8529.27] + - [827, 8529.27] - - [2048, 1024, 1, 4160] - - [729, 9305.57] + - [732, 9305.57] - - [2048, 256, 1, 3584] - - [821, 7903.13] + - [824, 7903.13] - - [128, 200, 1, 2048] - - [789, 1135.81] + - [792, 1135.81] - - [4096, 512, 1, 10240] - - [731, 9339.49] + - [734, 9339.49] - - [4096, 512, 1, 512] - - [729, 8446.68] + - [732, 8446.68] - - [2048, 1024, 1, 6656] - - [729, 9331.65] + - [732, 9331.65] - - [1024, 512, 1, 640] - - [821, 6775.94] + - [824, 6775.94] - - [2048, 512, 1, 768] - - [823, 8085.41] + - [826, 8085.41] - - [2048, 200, 1, 1408] - - [821, 5880.07] + - [824, 5880.07] - - [4096, 200, 1, 2048] - - [837, 6691.61] + - [840, 6691.61] - - [1024, 1024, 1, 5632] - - [823, 8749.53] + - [826, 8749.53] - - [2048, 512, 1, 3584] - - [827, 8704.13] + - [830, 8704.13] - - [64, 512, 1, 512] - - [763, 667.883] + - [766, 667.883] - - [64, 200, 1, 512] - - [773, 251.288] + - [776, 251.288] - - [1024, 200, 1, 64] - - [728, 1310.72] + - [731, 1310.72] - - [512, 512, 1, 2304] - - [821, 6078.7] + - [824, 6078.7] - - [2048, 1024, 1, 14336] - - [729, 9321.84] + - [732, 9321.84] - - [4096, 512, 1, 11264] - - [731, 9339.85] + - [734, 9339.85] - - [4096, 512, 1, 128] - - [816, 6566.43] + - [819, 6566.43] - - [1024, 512, 1, 64] - - [836, 2953.74] + - [839, 2953.74] - - [4096, 512, 1, 768] - - [729, 8738.13] + - [732, 8738.13] - - [4096, 1024, 1, 11264] - - [729, 9637.68] + - [732, 9637.68] - - [1, 256, 1, 1024] - - [819, 8.83234] + - [822, 8.83234] - - [4096, 200, 1, 7680] - - [816, 6889.47] + - [819, 6889.47] - - [1024, 200, 1, 12288] - - [788, 5237.64] + - [791, 5237.64] - - [1024, 1024, 1, 1280] - - [823, 8418.07] + - [826, 8418.07] - - [4096, 1024, 1, 16640] - - [729, 9674.91] + - [732, 9674.91] - - [2048, 1024, 1, 5632] - - [729, 9327.75] + - [732, 9327.75] - - [1024, 200, 1, 15360] - - [788, 5386.53] + - [791, 5386.53] - - [1, 1024, 1, 1024] - - [838, 27.2499] + - [841, 27.2499] - - [2048, 256, 1, 16384] - - [799, 7652.65] + - [802, 7652.65] - - [4096, 512, 1, 12288] - - [731, 9359.41] + - [734, 9359.41] - - [2048, 200, 1, 896] - - [832, 5628.86] + - [835, 5628.86] - - [4096, 1024, 1, 5632] - - [729, 9626.68] + - [732, 9626.68] - - [2048, 256, 1, 32] - - [825, 1889.33] + - [828, 1889.33] - - [2048, 256, 1, 1280] - - [821, 7390.84] + - [824, 7390.84] - - [4096, 256, 1, 4096] - - [823, 8694.27] + - [826, 8694.27] - - [2048, 256, 1, 11264] - - [821, 8113.85] + - [824, 8113.85] - - [4096, 200, 1, 9216] - - [823, 6890.98] + - [826, 6890.98] - - [1024, 512, 1, 4096] - - [769, 7348.36] + - [772, 7348.36] - - [2048, 1024, 1, 10240] - - [731, 9095.81] + - [734, 9095.81] - - [4096, 1024, 1, 640] - - [729, 9115.58] + - [732, 9115.58] - - [128, 1024, 1, 2048] - - [722, 3270.41] + - [725, 3270.41] - - [4096, 200, 1, 3840] - - [816, 6836.16] + - [819, 6836.16] - - [1024, 1024, 1, 1920] - - [827, 8562.72] + - [830, 8562.72] - - [2048, 200, 1, 7168] - - [832, 6296.13] + - [835, 6296.13] - - [2048, 512, 1, 16384] - - [723, 8632.41] + - [726, 8632.41] - - [2048, 1024, 1, 12288] - - [729, 9157.98] + - [732, 9157.98] - - [4096, 1024, 1, 10240] - - [729, 9658.74] + - [732, 9658.74] - - [1024, 1024, 1, 8320] - - [831, 8799.48] + - [834, 8799.48] - - [1024, 256, 1, 9216] - - [821, 6375.13] + - [824, 6375.13] - - [4096, 256, 1, 1152] - - [816, 8300.99] + - [819, 8300.99] - - [512, 200, 1, 2560] - - [782, 3088.41] + - [785, 3088.41] - - [2048, 256, 1, 1920] - - [821, 7714.84] + - [824, 7714.84] - - [2048, 1024, 1, 4608] - - [729, 9305.6] + - [732, 9305.6] - - [512, 256, 1, 1024] - - [829, 2887.64] + - [832, 2887.64] - - [1024, 256, 1, 1920] - - [813, 5913.02] + - [816, 5913.02] - - [4096, 512, 1, 3584] - - [729, 9275.59] + - [732, 9275.59] - - [2048, 512, 1, 4160] - - [834, 8733.93] + - [837, 8733.93] - - [2048, 512, 1, 5632] - - [837, 8758.88] + - [840, 8758.88] - - [4096, 1024, 1, 4608] - - [729, 9657.12] + - [732, 9657.12] - - [4096, 1024, 1, 3328] - - [729, 9621.35] + - [732, 9621.35] - - [4096, 256, 1, 7168] - - [823, 8769.95] + - [826, 8769.95] - - [4096, 200, 1, 128] - - [837, 4458.23] + - [840, 4458.23] - - [2048, 200, 1, 5120] - - [821, 6176.81] + - [824, 6176.81] - - [1024, 1024, 1, 6656] - - [823, 8780.35] + - [826, 8780.35] - - [512, 1024, 1, 3200] - - [832, 7886.99] + - [835, 7886.99] - - [512, 200, 1, 2304] - - [722, 2990.99] + - [725, 2990.99] - - [2048, 1024, 1, 9216] - - [734, 9325.36] + - [737, 9325.36] - - [2048, 256, 1, 1536] - - [832, 7551.63] + - [835, 7551.63] - - [4096, 256, 1, 256] - - [837, 6932.73] + - [840, 6932.73] - - [2048, 512, 1, 1408] - - [834, 8430.76] + - [837, 8430.76] - - [1024, 256, 1, 384] - - [826, 4462.03] + - [829, 4462.03] - - [2048, 1024, 1, 2304] - - [729, 9174.84] + - [732, 9174.84] - - [4096, 512, 1, 6144] - - [731, 9284.15] + - [734, 9284.15] - - [1024, 200, 1, 14336] - - [720, 5268.47] + - [723, 5268.47] - - [1024, 512, 1, 2080] - - [832, 7736.37] + - [835, 7736.37] - - [2048, 512, 1, 2304] - - [834, 8615.97] + - [837, 8615.97] - - [4096, 512, 1, 15360] - - [734, 9362.07] + - [737, 9362.07] - - [1024, 256, 1, 32] - - [754, 1028.02] + - [757, 1028.02] - - [1024, 200, 1, 2816] - - [832, 4780.48] + - [835, 4780.48] - - [4096, 200, 1, 512] - - [823, 6054.13] + - [826, 6054.13] - - [4096, 1024, 1, 7168] - - [734, 9468.39] + - [737, 9468.39] - - [2048, 256, 1, 14336] - - [795, 7865.42] + - [798, 7865.42] - - [1024, 200, 1, 3072] - - [832, 4804.1] + - [835, 4804.1] - - [2048, 200, 1, 1280] - - [832, 5846.21] + - [835, 5846.21] - - [1024, 1024, 1, 2304] - - [823, 8633.22] + - [826, 8633.22] - - [4096, 1024, 1, 9216] - - [729, 9640.93] + - [732, 9640.93] - - [2048, 512, 1, 4608] - - [834, 8743.2] + - [837, 8743.2] - - [4096, 1024, 1, 7680] - - [729, 9684.76] + - [732, 9684.76] - - [4096, 256, 1, 6144] - - [834, 8757.14] + - [837, 8757.14] - - [4096, 256, 1, 896] - - [827, 8258.83] + - [830, 8258.83] - - [512, 256, 1, 1536] - - [811, 3065.26] + - [814, 3065.26] - - [1024, 256, 1, 512] - - [821, 4752.75] + - [824, 4752.75] - - [2048, 256, 1, 640] - - [821, 6775.94] + - [824, 6775.94] - - [256, 256, 1, 2048] - - [758, 2248.96] + - [761, 2248.96] - - [2048, 1024, 1, 8192] - - [729, 9178.07] + - [732, 9178.07] - - [4096, 200, 1, 16640] - - [721, 7009.49] + - [724, 7009.49] - - [256, 512, 1, 512] - - [733, 2511.56] + - [736, 2511.56] - - [2048, 512, 1, 384] - - [834, 7467.6] + - [837, 7467.6] - - [2048, 200, 1, 16384] - - [802, 6327.21] + - [805, 6327.21] - - [4096, 200, 1, 10240] - - [827, 6892.64] + - [830, 6892.64] - - [1024, 512, 1, 9216] - - [776, 7529.99] + - [779, 7529.99] - - [4096, 1024, 1, 64] - - [751, 6260.16] + - [754, 6260.16] - - [4096, 200, 1, 1920] - - [837, 6710.17] + - [840, 6710.17] - - [2048, 1024, 1, 1280] - - [729, 8998.24] + - [732, 8998.24] - - [1024, 200, 1, 3840] - - [821, 4873.77] + - [824, 4873.77] - - [256, 1024, 1, 512] - - [832, 4766.25] + - [835, 4766.25] - - [2048, 1024, 1, 3328] - - [729, 9275.1] + - [732, 9275.1] - - [1024, 256, 1, 16640] - - [786, 6837.12] + - [789, 6837.12] - - [4096, 512, 1, 14336] - - [734, 9354.32] + - [737, 9354.32] - - [1024, 1024, 1, 16640] - - [831, 8832.27] + - [834, 8832.27] - - [1024, 256, 1, 1152] - - [832, 5642.56] + - [835, 5642.56] - - [512, 512, 1, 512] - - [821, 4779.83] + - [824, 4779.83] - - [4096, 512, 1, 8320] - - [734, 9327.86] + - [737, 9327.86] - - [2048, 512, 1, 7680] - - [837, 8793.86] + - [840, 8793.86] - - [4096, 1024, 1, 6656] - - [729, 9666.93] + - [732, 9666.93] - - [1024, 512, 1, 3584] - - [832, 7900.47] + - [835, 7900.47] - - [1024, 1024, 1, 32] - - [817, 2974.68] + - [820, 2974.68] - - [512, 512, 1, 2816] - - [813, 6155.75] + - [816, 6155.75] - - [2048, 512, 1, 1664] - - [837, 8496.45] + - [840, 8496.45] - - [1024, 1024, 1, 14336] - - [723, 8624.64] + - [726, 8624.64] - - [2048, 200, 1, 2048] - - [832, 6029.76] + - [835, 6029.76] - - [1024, 1024, 1, 3584] - - [823, 8702.52] + - [826, 8702.52] - - [512, 200, 1, 1280] - - [737, 2350.65] + - [740, 2350.65] - - [4096, 256, 1, 6656] - - [837, 8788.31] + - [840, 8788.31] - - [4096, 256, 1, 4160] - - [814, 8728.34] + - [817, 8728.34] - - [128, 256, 1, 1024] - - [796, 859.489] + - [799, 859.489] - - [512, 200, 1, 3200] - - [737, 3376.75] + - [740, 3376.75] - - [2048, 512, 1, 9216] - - [820, 8806.3] + - [823, 8806.3] - - [2048, 1024, 1, 256] - - [816, 7713.66] + - [819, 7713.66] - - [1024, 256, 1, 2304] - - [832, 6015.73] + - [835, 6015.73] - - [1024, 200, 1, 8192] - - [832, 5021.92] + - [835, 5021.92] - - [2048, 256, 1, 3072] - - [749, 7514.99] + - [752, 7514.99] - - [2048, 256, 1, 8320] - - [821, 8063.58] + - [824, 8063.58] - - [4096, 512, 1, 1024] - - [731, 8824.31] + - [734, 8824.31] - - [1024, 512, 1, 3200] - - [821, 7866.29] + - [824, 7866.29] - - [1024, 512, 1, 896] - - [813, 7161.01] + - [816, 7161.01] - - [2048, 512, 1, 1280] - - [827, 8384.42] + - [830, 8384.42] - - [4096, 200, 1, 64] - - [736, 3260.5] + - [739, 3260.5] - - [1024, 256, 1, 6144] - - [842, 6143.62] + - [845, 6143.62] - - [1024, 200, 1, 2560] - - [821, 4762.79] + - [824, 4762.79] - - [1024, 1024, 1, 5120] - - [750, 8454.13] + - [753, 8454.13] - - [2048, 512, 1, 6656] - - [827, 8798.95] + - [830, 8798.95] - - [4096, 1024, 1, 1536] - - [729, 9503.27] + - [732, 9503.27] - - [1024, 1024, 1, 128] - - [752, 5825.42] + - [755, 5825.42] - - [512, 1024, 1, 1792] - - [821, 7701.02] + - [824, 7701.02] - - [2048, 1024, 1, 32] - - [732, 3938.31] + - [735, 3938.31] - - [4096, 256, 1, 2816] - - [816, 8652.1] + - [819, 8652.1] - - [1024, 1024, 1, 15360] - - [723, 8719.6] + - [726, 8719.6] - - [1024, 256, 1, 5632] - - [821, 6344.08] + - [824, 6344.08] - - [1024, 1024, 1, 4096] - - [824, 8187.76] + - [827, 8187.76] - - [2048, 200, 1, 4160] - - [832, 6222.38] + - [835, 6222.38] - - [512, 256, 1, 768] - - [763, 2771.57] + - [766, 2771.57] - - [4096, 512, 1, 640] - - [734, 8590.48] + - [737, 8590.48] - - [2048, 512, 1, 8192] - - [776, 8494.8] + - [779, 8494.8] - - [1024, 512, 1, 768] - - [821, 7049.25] + - [824, 7049.25] - - [4096, 200, 1, 8320] - - [816, 6908.6] + - [819, 6908.6] - - [2048, 512, 1, 896] - - [823, 8224.13] + - [826, 8224.13] - - [4096, 200, 1, 7168] - - [834, 6878.49] + - [837, 6878.49] - - [2048, 512, 1, 13312] - - [822, 8802.94] + - [825, 8802.94] - - [64, 512, 1, 1024] - - [726, 843.924] + - [729, 843.924] - - [2048, 200, 1, 3840] - - [821, 6192.38] + - [824, 6192.38] - - [1024, 1024, 1, 768] - - [814, 8098.41] + - [817, 8098.41] - - [4096, 512, 1, 16384] - - [734, 9345.63] + - [737, 9345.63] - - [4096, 256, 1, 2304] - - [814, 8596.35] + - [817, 8596.35] - - [1, 256, 1, 4096] - - [819, 19.8293] + - [822, 19.8293] - - [1024, 1024, 1, 11264] - - [824, 8491.38] + - [827, 8491.38] - - [2048, 200, 1, 16640] - - [818, 6510.54] + - [821, 6510.54] - - [1024, 256, 1, 3072] - - [832, 6179.45] + - [835, 6179.45] - - [4096, 1024, 1, 512] - - [729, 9032.15] + - [732, 9032.15] - - [2048, 256, 1, 2816] - - [821, 7793.47] + - [824, 7793.47] - - [32, 512, 1, 512] - - [733, 318.716] + - [736, 318.716] - - [256, 512, 1, 2048] - - [784, 3368.92] + - [787, 3368.92] - - [1024, 512, 1, 384] - - [832, 6198.48] + - [835, 6198.48] - - [2048, 200, 1, 7680] - - [821, 6307.6] + - [824, 6307.6] - - [1024, 512, 1, 4608] - - [832, 7953.38] + - [835, 7953.38] - - [4096, 200, 1, 32] - - [781, 2199.19] + - [784, 2199.19] - - [4096, 200, 1, 3328] - - [816, 6813.02] + - [819, 6813.02] - - [1024, 200, 1, 1152] - - [821, 4375.55] + - [824, 4375.55] - - [1024, 1024, 1, 1408] - - [823, 8457.81] + - [826, 8457.81] - - [2048, 200, 1, 15360] - - [797, 6333.0] + - [800, 6333.0] - - [512, 1024, 1, 2048] - - [807, 6280.66] + - [810, 6280.66] - - [1024, 512, 1, 1024] - - [832, 7064.09] + - [835, 7064.09] - - [1024, 200, 1, 10240] - - [821, 5030.59] + - [824, 5030.59] - - [4096, 256, 1, 5632] - - [834, 8765.12] + - [837, 8765.12] - - [512, 512, 1, 3072] - - [844, 5942.34] + - [847, 5942.34] - - [2048, 256, 1, 1408] - - [821, 7544.95] + - [824, 7544.95] - - [2048, 256, 1, 6144] - - [832, 7963.87] + - [835, 7963.87] - - [4096, 256, 1, 3328] - - [827, 8682.48] + - [830, 8682.48] - - [1024, 200, 1, 1664] - - [821, 4595.3] + - [824, 4595.3] - - [2048, 1024, 1, 1152] - - [729, 8942.55] + - [732, 8942.55] - - [2048, 512, 1, 6144] - - [822, 8729.61] + - [825, 8729.61] - - [2048, 512, 1, 3200] - - [823, 8696.46] + - [826, 8696.46] - - [4096, 1024, 1, 2080] - - [762, 9538.35] + - [765, 9538.35] - - [4096, 1024, 1, 768] - - [729, 9260.65] + - [732, 9260.65] - - [4096, 1024, 1, 2560] - - [729, 9567.17] + - [732, 9567.17] - - [64, 200, 1, 2048] - - [761, 583.061] + - [764, 583.061] - - [2048, 200, 1, 4608] - - [832, 6243.18] + - [835, 6243.18] - - [1024, 1024, 1, 6144] - - [824, 8320.15] + - [827, 8320.15] - - [4096, 256, 1, 1664] - - [827, 8503.07] + - [830, 8503.07] - - [2048, 200, 1, 384] - - [832, 4939.9] + - [835, 4939.9] - - [1, 200, 1, 2048] - - [778, 11.2281] + - [781, 11.2281] - - [4096, 256, 1, 1792] - - [837, 8504.02] + - [840, 8504.02] - - [2048, 1024, 1, 64] - - [751, 5309.25] + - [754, 5309.25] - - [4096, 1024, 1, 16384] - - [718, 9428.51] + - [721, 9428.51] - - [1024, 512, 1, 16640] - - [832, 8122.45] + - [835, 8122.45] - - [2048, 512, 1, 10240] - - [822, 8766.11] + - [825, 8766.11] - - [4096, 512, 1, 6656] - - [729, 9351.65] + - [732, 9351.65] - - [2048, 256, 1, 16640] - - [821, 8135.17] + - [824, 8135.17] - - [2048, 512, 1, 2816] - - [823, 8660.22] + - [826, 8660.22] - - [1024, 200, 1, 32] - - [741, 780.191] + - [744, 780.191] - - [1, 512, 1, 4096] - - [766, 34.7671] + - [769, 34.7671] - - [256, 256, 1, 1024] - - [773, 1489.98] + - [776, 1489.98] - - [2048, 1024, 1, 128] - - [746, 6605.2] + - [749, 6605.2] - - [2048, 1024, 1, 2080] - - [729, 9159.41] + - [732, 9159.41] - - [2048, 1024, 1, 16640] - - [729, 9371.55] + - [732, 9371.55] - - [1024, 200, 1, 384] - - [832, 3378.14] + - [835, 3378.14] - - [4096, 256, 1, 384] - - [777, 7369.2] + - [780, 7369.2] - - [4096, 256, 1, 13312] - - [831, 8776.38] + - [834, 8776.38] - - [2048, 256, 1, 128] - - [826, 4279.9] + - [829, 4279.9] - - [512, 256, 1, 2304] - - [738, 3584.88] + - [741, 3584.88] - - [2048, 1024, 1, 3072] - - [731, 9156.42] + - [734, 9156.42] - - [1024, 1024, 1, 640] - - [827, 7928.74] + - [830, 7928.74] - - [256, 512, 1, 1024] - - [832, 2843.6] + - [835, 2843.6] - - [4096, 1024, 1, 1408] - - [729, 9437.46] + - [732, 9437.46] - - [4096, 200, 1, 5632] - - [834, 6873.86] + - [837, 6873.86] - - [4096, 1024, 1, 2048] - - [729, 9437.0] + - [732, 9437.0] - - [2048, 1024, 1, 2560] - - [734, 9195.52] + - [737, 9195.52] - - [4096, 1024, 1, 128] - - [816, 7407.16] + - [819, 7407.16] - - [1024, 200, 1, 3328] - - [832, 4857.29] + - [835, 4857.29] - - [2048, 200, 1, 1152] - - [821, 5760.0] + - [824, 5760.0] - - [1024, 200, 1, 9216] - - [720, 5053.11] + - [723, 5053.11] - - [4096, 256, 1, 512] - - [814, 7617.35] + - [817, 7617.35] - - [4096, 1024, 1, 14336] - - [729, 9665.02] + - [732, 9665.02] - - [1024, 1024, 1, 384] - - [752, 7478.7] + - [755, 7478.7] - - [2048, 200, 1, 512] - - [821, 5150.18] + - [824, 5150.18] - - [2048, 256, 1, 9216] - - [800, 7717.61] + - [803, 7717.61] - - [2048, 256, 1, 1792] - - [821, 7655.84] + - [824, 7655.84] - - [4096, 512, 1, 9216] - - [731, 9331.12] + - [734, 9331.12] - - [4096, 200, 1, 15360] - - [721, 6958.04] + - [724, 6958.04] - - [1024, 512, 1, 2048] - - [820, 7067.81] + - [823, 7067.81] - - [64, 256, 1, 2048] - - [745, 723.156] + - [748, 723.156] - - [4096, 200, 1, 1792] - - [823, 6699.55] + - [826, 6699.55] - - [1, 200, 1, 4096] - - [755, 15.5387] + - [758, 15.5387] - - [2048, 1024, 1, 2048] - - [734, 9071.83] + - [737, 9071.83] - - [1024, 200, 1, 2080] - - [813, 4679.09] + - [816, 4679.09] - - [2048, 200, 1, 1536] - - [832, 5939.82] + - [835, 5939.82] - - [1024, 1024, 1, 3072] - - [794, 8333.05] + - [797, 8333.05] - - [512, 200, 1, 1792] - - [719, 2679.63] + - [722, 2679.63] - - [1024, 256, 1, 11264] - - [722, 6470.88] + - [725, 6470.88] - - [2048, 512, 1, 12288] - - [769, 8729.14] + - [772, 8729.14] - - [1024, 256, 1, 1792] - - [832, 5931.34] + - [835, 5931.34] - - [1024, 200, 1, 7168] - - [832, 4970.23] + - [835, 4970.23] - - [32, 256, 1, 1024] - - [743, 237.234] + - [746, 237.234] - - [512, 256, 1, 3072] - - [786, 3813.0] + - [789, 3813.0] - - [1024, 1024, 1, 2080] - - [823, 8600.31] + - [826, 8600.31] - - [2048, 200, 1, 2304] - - [832, 6093.22] + - [835, 6093.22] - - [4096, 512, 1, 1536] - - [729, 9074.9] + - [732, 9074.9] - - [2048, 256, 1, 7168] - - [832, 7895.16] + - [835, 7895.16] - - [2048, 512, 1, 1792] - - [834, 8531.82] + - [837, 8531.82] - - [1024, 200, 1, 2048] - - [821, 4685.33] + - [824, 4685.33] - - [1024, 1024, 1, 4608] - - [827, 8735.61] + - [830, 8735.61] - - [4096, 256, 1, 8192] - - [823, 8782.45] + - [826, 8782.45] - - [512, 1024, 1, 1280] - - [813, 7483.15] + - [816, 7483.15] - - [2048, 1024, 1, 16384] - - [723, 8878.86] + - [726, 8878.86] - - [512, 512, 1, 1280] - - [821, 5745.62] + - [824, 5745.62] - - [1024, 200, 1, 1280] - - [813, 4446.13] + - [816, 4446.13] - - [2048, 256, 1, 3200] - - [821, 7842.75] + - [824, 7842.75] - - [2048, 512, 1, 15360] - - [769, 8757.14] + - [772, 8757.14] - - [1024, 512, 1, 3328] - - [821, 7853.94] + - [824, 7853.94] - - [1024, 512, 1, 4160] - - [821, 7934.51] + - [824, 7934.51] - - [4096, 200, 1, 6656] - - [823, 6883.2] + - [826, 6883.2] - - [4096, 1024, 1, 1024] - - [729, 9229.34] + - [732, 9229.34] - - [2048, 200, 1, 3328] - - [832, 6182.64] + - [835, 6182.64] - - [1024, 1024, 1, 256] - - [752, 6932.73] + - [755, 6932.73] - - [512, 200, 1, 512] - - [773, 1910.67] + - [776, 1910.67] - - [2048, 256, 1, 64] - - [744, 2912.71] + - [747, 2912.71] - - [1024, 256, 1, 2560] - - [821, 6123.07] + - [824, 6123.07] - - [2048, 512, 1, 11264] - - [833, 8728.84] + - [836, 8728.84] - - [32, 200, 1, 1024] - - [828, 187.46] + - [831, 187.46] - - [32, 512, 1, 2048] - - [772, 694.421] + - [775, 694.421] - - [2048, 256, 1, 2304] - - [821, 7759.25] + - [824, 7759.25] - - [2048, 256, 1, 12288] - - [800, 7726.25] + - [803, 7726.25] - - [4096, 200, 1, 8192] - - [823, 6870.84] + - [826, 6870.84] - - [1024, 512, 1, 7168] - - [769, 7479.1] + - [772, 7479.1] - - [1024, 512, 1, 1792] - - [821, 7626.01] + - [824, 7626.01] - - [4096, 1024, 1, 1664] - - [729, 9503.44] + - [732, 9503.44] - - [4096, 200, 1, 2816] - - [816, 6775.34] + - [819, 6775.34] - - [1024, 1024, 1, 896] - - [823, 8229.89] + - [826, 8229.89] - - [1024, 200, 1, 8320] - - [784, 5173.48] + - [787, 5173.48] - - [1024, 1024, 1, 12288] - - [824, 8463.11] + - [827, 8463.11] - - [1024, 256, 1, 8320] - - [813, 6404.27] + - [816, 6404.27] - - [1024, 200, 1, 1024] - - [821, 4297.44] + - [824, 4297.44] - - [1024, 200, 1, 16640] - - [783, 5499.41] + - [786, 5499.41] - - [4096, 256, 1, 5120] - - [837, 8729.05] + - [840, 8729.05] - - [1024, 256, 1, 3200] - - [832, 6124.86] + - [835, 6124.86] - - [512, 512, 1, 2560] - - [832, 6109.69] + - [835, 6109.69] - - [4096, 256, 1, 2048] - - [837, 8510.95] + - [840, 8510.95] - - [1024, 256, 1, 640] - - [821, 5102.56] + - [824, 5102.56] - - [2048, 256, 1, 5120] - - [749, 7667.83] + - [752, 7667.83] - - [2048, 256, 1, 7680] - - [832, 8054.35] + - [835, 8054.35] - - [4096, 512, 1, 384] - - [827, 8190.67] + - [830, 8190.67] - - [2048, 200, 1, 3584] - - [821, 6166.02] + - [824, 6166.02] - - [1024, 512, 1, 1536] - - [821, 7517.8] + - [824, 7517.8] - - [4096, 512, 1, 3328] - - [729, 9259.35] + - [732, 9259.35] - - [4096, 1024, 1, 256] - - [729, 8341.69] + - [732, 8341.69] - - [2048, 200, 1, 64] - - [792, 2307.61] + - [795, 2307.61] - - [2048, 200, 1, 4096] - - [832, 6211.94] + - [835, 6211.94] - - [1024, 1024, 1, 1536] - - [823, 8484.05] + - [826, 8484.05] - - [2048, 1024, 1, 7168] - - [731, 9315.14] + - [734, 9315.14] - - [1024, 256, 1, 3584] - - [821, 6207.22] + - [824, 6207.22] - - [4096, 256, 1, 32] - - [825, 2892.62] + - [828, 2892.62] - - [4096, 256, 1, 1280] - - [834, 8392.8] + - [837, 8392.8] - - [512, 512, 1, 3200] - - [832, 6219.31] + - [835, 6219.31] - - [2048, 1024, 1, 1536] - - [731, 9052.45] + - [734, 9052.45] - - [2048, 256, 1, 1024] - - [821, 7192.8] + - [824, 7192.8] - - [128, 200, 1, 512] - - [811, 502.577] + - [814, 502.577] - - [4096, 512, 1, 7168] - - [734, 9329.01] + - [737, 9329.01] - - [1024, 512, 1, 1152] - - [821, 7358.43] + - [824, 7358.43] - - [64, 1024, 1, 2048] - - [739, 2102.41] + - [742, 2102.41] - - [2048, 512, 1, 3328] - - [823, 8694.59] + - [826, 8694.59] - - [4096, 1024, 1, 896] - - [729, 9342.92] + - [732, 9342.92] - - [1, 1024, 1, 2048] - - [779, 40.8324] + - [782, 40.8324] - - [4096, 200, 1, 3584] - - [827, 6810.2] + - [830, 6810.2] - - [4096, 1024, 1, 4096] - - [729, 9347.46] + - [732, 9347.46] - - [1024, 256, 1, 14336] - - [722, 6625.7] + - [725, 6625.7] - - [2048, 200, 1, 256] - - [821, 4413.2] + - [824, 4413.2] - - [4096, 256, 1, 16384] - - [723, 8752.03] + - [726, 8752.03] - - [4096, 256, 1, 1920] - - [814, 8533.68] + - [817, 8533.68] - - [32, 1024, 1, 512] - - [812, 647.269] + - [815, 647.269] - - [1024, 256, 1, 7680] - - [832, 6387.26] + - [835, 6387.26] - - [2048, 256, 1, 1664] - - [832, 7631.34] + - [835, 7631.34] - - [512, 200, 1, 1536] - - [737, 2576.78] + - [740, 2576.78] - - [2048, 1024, 1, 6144] - - [718, 9033.67] + - [721, 9033.67] - - [512, 256, 1, 2816] - - [784, 3977.36] + - [787, 3977.36] - - [4096, 512, 1, 4160] - - [731, 9288.92] + - [734, 9288.92] - - [4096, 512, 1, 2080] - - [810, 9150.18] + - [813, 9150.18] - - [2048, 256, 1, 15360] - - [795, 7963.87] + - [798, 7963.87] - - [4096, 200, 1, 5120] - - [834, 6861.52] + - [837, 6861.52] - - [1024, 512, 1, 8192] - - [820, 7473.15] + - [823, 7473.15] - - [4096, 200, 1, 896] - - [837, 6443.15] + - [840, 6443.15] - - [2048, 512, 1, 8320] - - [827, 8810.14] + - [830, 8810.14] - - [1024, 1024, 1, 10240] - - [835, 8436.6] + - [838, 8436.6] - - [1024, 200, 1, 768] - - [821, 4087.48] + - [824, 4087.48] - - [2048, 200, 1, 640] - - [832, 5416.2] + - [835, 5416.2] - - [512, 200, 1, 2048] - - [786, 2702.52] + - [789, 2702.52] - - [1024, 1024, 1, 9216] - - [824, 8498.98] + - [827, 8498.98] - - [4096, 200, 1, 1408] - - [834, 6613.72] + - [837, 6613.72] - - [1024, 256, 1, 13312] - - [722, 6643.44] + - [725, 6643.44] - - [1024, 256, 1, 128] - - [753, 2706.0] + - [756, 2706.0] - - [2048, 200, 1, 5632] - - [832, 6270.02] + - [835, 6270.02] - - [64, 1024, 1, 512] - - [811, 1310.72] + - [814, 1310.72] - - [1024, 512, 1, 2560] - - [832, 7731.44] + - [835, 7731.44] - - [4096, 200, 1, 1280] - - [814, 6566.73] + - [817, 6566.73] - - [1024, 200, 1, 4096] - - [832, 4911.36] + - [835, 4911.36] - - [1024, 1024, 1, 2560] - - [823, 8630.25] + - [826, 8630.25] - - [2048, 512, 1, 64] - - [827, 4152.78] + - [830, 4152.78] - - [2048, 200, 1, 8192] - - [821, 6234.11] + - [824, 6234.11] - - [2048, 512, 1, 3072] - - [831, 8614.75] + - [834, 8614.75] - - [4096, 1024, 1, 5120] - - [729, 9573.65] + - [732, 9573.65] - - [4096, 256, 1, 640] - - [816, 7913.78] + - [819, 7913.78] - - [1024, 256, 1, 1280] - - [821, 5706.54] + - [824, 5706.54] - - [2048, 1024, 1, 1920] - - [731, 9141.24] + - [734, 9141.24] - - [2048, 256, 1, 4096] - - [821, 7937.18] + - [824, 7937.18] - - [2048, 1024, 1, 15360] - - [734, 9351.86] + - [737, 9351.86] - - [4096, 200, 1, 16384] - - [723, 6975.11] + - [726, 6975.11] - - [1, 1024, 1, 4096] - - [841, 60.6815] + - [844, 60.6815] - - [4096, 1024, 1, 2816] - - [729, 9583.88] + - [732, 9583.88] - - [4096, 200, 1, 1664] - - [816, 6658.6] + - [819, 6658.6] - - [4096, 512, 1, 256] - - [747, 7731.44] + - [750, 7731.44] - - [1024, 200, 1, 896] - - [821, 4193.35] + - [824, 4193.35] - - [2048, 200, 1, 6656] - - [832, 6291.07] + - [835, 6291.07] - - [2048, 1024, 1, 5120] - - [731, 9270.47] + - [734, 9270.47] - - [512, 1024, 1, 768] - - [821, 7098.96] + - [824, 7098.96] - - [2048, 512, 1, 14336] - - [801, 8559.03] + - [804, 8559.03] - - [2048, 200, 1, 8320] - - [821, 6314.62] + - [824, 6314.62] - - [4096, 256, 1, 3840] - - [837, 8718.46] + - [840, 8718.46] - - [2048, 1024, 1, 4096] - - [718, 8973.28] + - [721, 8973.28] - - [1024, 1024, 1, 3200] - - [827, 8701.88] + - [830, 8701.88] - - [1024, 256, 1, 4608] - - [821, 6267.95] + - [824, 6267.95] - - [4096, 512, 1, 4608] - - [729, 9316.37] + - [732, 9316.37] - - [2048, 512, 1, 2048] - - [820, 8462.66] + - [823, 8462.66] - - [4096, 512, 1, 1664] - - [729, 9074.43] + - [732, 9074.43] - - [4096, 256, 1, 4608] - - [816, 8717.95] + - [819, 8717.95] - - [1024, 512, 1, 32] - - [809, 1807.89] + - [812, 1807.89] - - [1024, 512, 1, 3840] - - [821, 7936.24] + - [824, 7936.24] - - [2048, 512, 1, 1920] - - [837, 8548.17] + - [840, 8548.17] - - [2048, 1024, 1, 896] - - [729, 8843.41] + - [732, 8843.41] - - [4096, 200, 1, 6144] - - [837, 6864.66] + - [840, 6864.66] - - [1024, 512, 1, 13312] - - [790, 7763.09] + - [793, 7763.09] - - [4096, 1024, 1, 4160] - - [729, 9650.62] + - [732, 9650.62] - - [2048, 200, 1, 2816] - - [821, 6119.66] + - [824, 6119.66] - - [1024, 1024, 1, 3840] - - [816, 8709.4] + - [819, 8709.4] - - [128, 1024, 1, 1024] - - [839, 2577.15] + - [842, 2577.15] - - [2048, 1024, 1, 11264] - - [734, 9338.96] + - [737, 9338.96] - - [2048, 1024, 1, 384] - - [823, 8210.71] + - [826, 8210.71] - - [1024, 256, 1, 2048] - - [844, 5755.48] + - [847, 5755.48] - - [2048, 1024, 1, 3840] - - [731, 9288.86] + - [734, 9288.86] - - [4096, 256, 1, 8320] - - [837, 8812.28] + - [840, 8812.28] - - [2048, 256, 1, 3840] - - [813, 7856.95] + - [816, 7856.95] - - [64, 256, 1, 512] - - [811, 336.082] + - [814, 336.082] - - [4096, 512, 1, 1280] - - [731, 8993.42] + - [734, 8993.42] - - [512, 256, 1, 1280] - - [763, 2995.93] + - [766, 2995.93] - - [1024, 512, 1, 7680] - - [821, 8041.49] + - [824, 8041.49] - - [4096, 1024, 1, 1152] - - [729, 9368.38] + - [732, 9368.38] - - [256, 200, 1, 512] - - [763, 992.97] + - [766, 992.97] - - [256, 1024, 1, 2048] - - [840, 4759.49] + - [843, 4759.49] - - [2048, 200, 1, 10240] - - [832, 6328.93] + - [835, 6328.93] - - [2048, 512, 1, 5120] - - [833, 8732.46] + - [836, 8732.46] - - [2048, 1024, 1, 1408] - - [731, 9006.8] + - [734, 9006.8] - - [512, 1024, 1, 512] - - [821, 6528.1] + - [824, 6528.1] - - [1024, 200, 1, 11264] - - [788, 5194.72] + - [791, 5194.72] - - [512, 1024, 1, 1024] - - [774, 6337.0] + - [777, 6337.0] - - [2048, 512, 1, 32] - - [740, 2777.68] + - [743, 2777.68] - - [4096, 256, 1, 2560] - - [823, 8621.39] + - [826, 8621.39] - - [4096, 256, 1, 64] - - [757, 4194.3] + - [760, 4194.3] - - [32, 1024, 1, 1024] - - [758, 778.164] + - [761, 778.164] - - [2048, 200, 1, 768] - - [832, 5507.23] + - [835, 5507.23] - - [512, 512, 1, 2048] - - [780, 5338.81] + - [783, 5338.81] - - [2048, 512, 1, 2560] - - [834, 8643.59] + - [837, 8643.59] - - [512, 256, 1, 512] - - [813, 2542.0] + - [816, 2542.0] - - [1024, 200, 1, 7680] - - [788, 5047.7] + - [791, 5047.7] - - [4096, 512, 1, 896] - - [729, 8856.75] + - [732, 8856.75] - - [4096, 1024, 1, 3072] - - [729, 9492.07] + - [732, 9492.07] - - [4096, 200, 1, 13312] - - [721, 6900.63] + - [724, 6900.63] - - [2048, 512, 1, 7168] - - [822, 8788.0] + - [825, 8788.0] - - [2048, 1024, 1, 2816] - - [734, 9229.78] + - [737, 9229.78] - - [2048, 512, 1, 128] - - [752, 5629.94] + - [755, 5629.94] - - [1024, 256, 1, 8192] - - [844, 6203.73] + - [847, 6203.73] - - [4096, 1024, 1, 1792] - - [729, 9510.32] + - [732, 9510.32] - - [1024, 200, 1, 6656] - - [813, 5002.75] + - [816, 5002.75] - - [1024, 1024, 1, 1024] - - [750, 8095.16] + - [753, 8095.16] - - [4096, 200, 1, 2304] - - [834, 6754.35] + - [837, 6754.35] - - [4096, 512, 1, 1152] - - [729, 8974.44] + - [732, 8974.44] - - [512, 200, 1, 1024] - - [811, 2232.91] + - [814, 2232.91] - - [1024, 256, 1, 3840] - - [832, 6244.62] + - [835, 6244.62] - - [512, 512, 1, 768] - - [821, 5331.74] + - [824, 5331.74] - - [2048, 512, 1, 4096] - - [831, 8621.66] + - [834, 8621.66] - - [2048, 256, 1, 2560] - - [821, 7770.83] + - [824, 7770.83] - - [2048, 256, 1, 4160] - - [832, 7922.98] + - [835, 7922.98] - - [1024, 256, 1, 64] - - [728, 1705.0] + - [731, 1705.0] - - [4096, 512, 1, 7680] - - [729, 9364.47] + - [732, 9364.47] - - [1024, 512, 1, 1664] - - [832, 7594.14] + - [835, 7594.14] - - [2048, 512, 1, 2080] - - [823, 8570.57] + - [826, 8570.57] - - [2048, 512, 1, 3840] - - [834, 8729.04] + - [837, 8729.04] - - [4096, 1024, 1, 384] - - [729, 8764.76] + - [732, 8764.76] - - [4096, 200, 1, 3072] - - [823, 6772.29] + - [826, 6772.29] - - [1024, 512, 1, 14336] - - [791, 7680.87] + - [794, 7680.87] - - [1024, 200, 1, 1920] - - [813, 4636.98] + - [816, 4636.98] - - [1024, 1024, 1, 1664] - - [827, 8506.39] + - [830, 8506.39] - - [512, 1024, 1, 2304] - - [821, 7775.23] + - [824, 7775.23] - - [2048, 1024, 1, 1792] - - [729, 9123.36] + - [732, 9123.36] - - [32, 200, 1, 512] - - [829, 125.644] + - [832, 125.644] - - [4096, 256, 1, 11264] - - [834, 8822.21] + - [837, 8822.21] - - [4096, 256, 1, 1408] - - [834, 8419.22] + - [837, 8419.22] - - [1024, 256, 1, 7168] - - [821, 6377.44] + - [824, 6377.44] - - [2048, 256, 1, 1152] - - [832, 7401.71] + - [835, 7401.71] - - [256, 256, 1, 512] - - [811, 1314.83] + - [814, 1314.83] - - [1024, 512, 1, 1280] - - [821, 7410.43] + - [824, 7410.43] - - [512, 512, 1, 1792] - - [813, 5931.34] + - [816, 5931.34] - - [2048, 200, 1, 12288] - - [795, 6242.15] + - [798, 6242.15] - - [2048, 200, 1, 1664] - - [832, 5953.65] + - [835, 5953.65] - - [4096, 200, 1, 4608] - - [827, 6853.44] + - [830, 6853.44] - - [512, 1024, 1, 2560] - - [821, 7778.03] + - [824, 7778.03] - - [4096, 200, 1, 384] - - [814, 5765.63] + - [817, 5765.63] - - [128, 512, 1, 512] - - [811, 1302.58] + - [814, 1302.58] - - [1024, 200, 1, 256] - - [815, 2861.83] + - [818, 2861.83] - - [256, 1024, 1, 1024] - - [756, 4522.16] + - [759, 4522.16] - - [2048, 200, 1, 128] - - [821, 3309.9] + - [824, 3309.9] - - [2048, 200, 1, 11264] - - [802, 6168.1] + - [805, 6168.1] - - [1024, 512, 1, 1920] - - [832, 7649.19] + - [835, 7649.19] - - [4096, 256, 1, 1536] - - [827, 8427.23] + - [830, 8427.23] - - [4096, 1024, 1, 3584] - - [729, 9617.9] + - [732, 9617.9] - - [2048, 256, 1, 256] - - [821, 5464.89] + - [824, 5464.89] - - [2048, 1024, 1, 768] - - [729, 8726.77] + - [732, 8726.77] - - [4096, 256, 1, 10240] - - [823, 8790.79] + - [826, 8790.79] - - [2048, 256, 1, 10240] - - [803, 7665.21] + - [806, 7665.21] - - [4096, 200, 1, 14336] - - [837, 6916.08] + - [840, 6916.08] - - [1024, 512, 1, 5120] - - [775, 7420.26] + - [778, 7420.26] - - [1024, 512, 1, 8320] - - [832, 8061.21] + - [835, 8061.21] - - [256, 200, 1, 2048] - - [787, 1916.26] + - [790, 1916.26] - - [1024, 200, 1, 640] - - [815, 3873.29] + - [818, 3873.29] - - [1024, 512, 1, 10240] - - [820, 7526.8] + - [823, 7526.8] - - [1024, 200, 1, 4160] - - [832, 4928.09] + - [835, 4928.09] - - [1024, 200, 1, 5632] - - [813, 4978.56] + - [816, 4978.56] - - [1024, 1024, 1, 2048] - - [768, 7937.18] + - [771, 7937.18] - - [1024, 256, 1, 6656] - - [832, 6373.58] + - [835, 6373.58] - - [2048, 1024, 1, 8320] - - [729, 9333.05] + - [732, 9333.05] - - [1024, 256, 1, 10240] - - [821, 6407.19] + - [824, 6407.19] - - [2048, 256, 1, 2080] - - [821, 7714.48] + - [824, 7714.48] - - [4096, 256, 1, 128] - - [735, 5765.37] + - [738, 5765.37] - - [1024, 256, 1, 768] - - [826, 5210.32] + - [829, 5210.32] - - [2048, 256, 1, 896] - - [832, 7267.36] + - [835, 7267.36] - - [64, 512, 1, 2048] - - [798, 1296.54] + - [801, 1296.54] - - [4096, 512, 1, 2048] - - [731, 9121.15] + - [734, 9121.15] - - [512, 256, 1, 2048] - - [784, 3283.21] + - [787, 3283.21] - - [4096, 256, 1, 16640] - - [816, 8839.78] + - [819, 8839.78] - - [4096, 512, 1, 2560] - - [734, 9222.05] + - [737, 9222.05] - - [1024, 512, 1, 15360] - - [785, 7865.56] + - [788, 7865.56] - - [4096, 1024, 1, 2304] - - [729, 9558.16] + - [732, 9558.16] - - [4096, 200, 1, 1152] - - [834, 6531.83] + - [837, 6531.83] - - [2048, 200, 1, 6144] - - [832, 6277.65] + - [835, 6277.65] - - [1024, 1024, 1, 7680] - - [827, 8799.24] + - [830, 8799.24] - - [2048, 200, 1, 1920] - - [832, 6030.92] + - [835, 6030.92] - - [32, 1024, 1, 2048] - - [806, 1174.88] + - [809, 1174.88] - - [1024, 200, 1, 3584] - - [813, 4880.34] + - [816, 4880.34] - - [4096, 256, 1, 2080] - - [820, 8557.12] + - [823, 8557.12] - - [1024, 1024, 1, 16384] - - [721, 8618.55] + - [724, 8618.55] - - [1024, 256, 1, 1408] - - [832, 5803.44] + - [835, 5803.44] - - [1024, 256, 1, 4096] - - [842, 6037.68] + - [845, 6037.68] - - [2048, 200, 1, 14336] - - [832, 6364.38] + - [835, 6364.38] - - [4096, 512, 1, 5120] - - [731, 9301.95] + - [734, 9301.95] - - [1024, 512, 1, 6144] - - [767, 7468.99] + - [770, 7468.99] - - [1024, 512, 1, 2304] - - [832, 7759.25] + - [835, 7759.25] - - [4096, 200, 1, 4160] - - [816, 6843.12] + - [819, 6843.12] - - [4096, 200, 1, 1536] - - [827, 6628.17] + - [830, 6628.17] - - [4096, 1024, 1, 6144] - - [729, 9592.98] + - [732, 9592.98] - - [256, 64, 1, 1225] - - [861, 1194.67] + - [864, 1194.67] - - [2048, 320, 1, 64] - - [863, 3449.26] + - [866, 3449.26] - - [1024, 128, 1, 289] - - [867, 2869.68] + - [870, 2869.68] - - [384, 64, 1, 1225] - - [852, 1511.33] + - [855, 1511.33] - - [2048, 384, 1, 64] - - [865, 3836.25] + - [868, 3836.25] - - [64, 80, 1, 5329] - - [864, 888.167] + - [867, 888.167] - - [1024, 384, 1, 289] - - [858, 4291.52] + - [861, 4291.52] - - [2048, 448, 1, 64] - - [857, 3783.52] + - [860, 3783.52] - - [768, 192, 1, 289] - - [862, 2690.33] + - [865, 2690.33] - - [288, 64, 1, 1225] - - [851, 1142.67] + - [854, 1142.67] - - [384, 96, 1, 1225] - - [869, 1844.71] + - [872, 1844.71] - - [1024, 3392, 1, 4096] - - [895, 8502.92] + - [898, 8502.92] - - [1024, 3301, 1, 4096] - - [897, 8414.0] + - [900, 8414.0] - - [1024, 3443, 1, 4096] - - [884, 8536.49] + - [887, 8536.49] - - [132, 134, 480, 64] - - [922, 4149.17] + - [925, 4149.17] - - [162, 162, 400, 64] - - [910, 5539.63] + - [913, 5539.63] - - [4096, 3548, 1, 1024] - - [876, 9772.91] + - [879, 9772.91] - - [4096, 2977, 1, 1024] - - [877, 9574.33] + - [880, 9574.33] - - [132, 135, 480, 64] - - [922, 4167.41] + - [925, 4167.41] - - [1024, 2985, 1, 4096] - - [880, 9133.89] + - [883, 9133.89] - - [33708, 3681, 1, 1024] - - [877, 10033.7] + - [880, 10033.7] - - [4096, 3443, 1, 1024] - - [877, 9513.68] + - [880, 9513.68] - - [11, 11, 5456, 64] - - [919, 627.246] + - [922, 627.246] - - [1024, 3400, 1, 4096] - - [898, 8419.92] + - [901, 8419.92] - - [4096, 3995, 1, 1024] - - [876, 9693.77] + - [879, 9693.77] - - [4096, 3190, 1, 1024] - - [876, 9474.74] + - [879, 9474.74] - - [4096, 3594, 1, 1024] - - [877, 9315.73] + - [880, 9315.73] - - [159, 162, 400, 64] - - [909, 5429.88] + - [912, 5429.88] - - [1024, 3565, 1, 4096] - - [892, 8532.7] + - [895, 8532.7] - - [4096, 3422, 1, 1024] - - [877, 9459.14] + - [880, 9459.14] - - [1024, 3214, 1, 4096] - - [897, 8064.82] + - [900, 8064.82] - - [33708, 3584, 1, 1024] - - [878, 10128.9] + - [881, 10128.9] - - [33708, 3640, 1, 1024] - - [875, 9919.12] + - [878, 9919.12] - - [4096, 3263, 1, 1024] - - [875, 9699.25] + - [878, 9699.25] - - [4096, 3296, 1, 1024] - - [875, 9780.7] + - [878, 9780.7] - - [1024, 3557, 1, 4096] - - [896, 8526.79] + - [899, 8526.79] - - [4096, 3463, 1, 1024] - - [875, 9578.03] + - [878, 9578.03] - - [4096, 3528, 1, 1024] - - [875, 9739.82] + - [878, 9739.82] - - [14, 14, 4368, 64] - - [907, 991.176] + - [910, 991.176] - - [4096, 3226, 1, 1024] - - [875, 9587.09] + - [878, 9587.09] - - [4096, 3439, 1, 1024] - - [878, 9499.62] + - [881, 9499.62] - - [1024, 3523, 1, 4096] - - [898, 8393.48] + - [901, 8393.48] - - [1024, 3098, 1, 4096] - - [904, 7882.77] + - [907, 7882.77] - - [4096, 3121, 1, 1024] - - [875, 9296.13] + - [878, 9296.13] - - [33708, 3894, 1, 1024] - - [876, 9952.17] + - [879, 9952.17] - - [1024, 3548, 1, 4096] - - [882, 8432.35] + - [885, 8432.35] - - [1024, 3451, 1, 4096] - - [895, 8456.34] + - [898, 8456.34] - - [4096, 3353, 1, 1024] - - [877, 9288.98] + - [880, 9288.98] - - [4096, 3402, 1, 1024] - - [877, 9406.34] + - [880, 9406.34] - - [4096, 3939, 1, 1024] - - [875, 9549.49] + - [878, 9549.49] - - [133, 133, 480, 64] - - [922, 4124.21] + - [925, 4124.21] - - [1024, 3559, 1, 4096] - - [897, 8586.94] + - [900, 8586.94] - - [1024, 2977, 1, 4096] - - [880, 9084.49] + - [883, 9084.49] - - [1024, 3478, 1, 4096] - - [891, 8342.75] + - [894, 8342.75] - - [134, 134, 480, 64] - - [924, 4204.33] + - [927, 4204.33] - - [1024, 3368, 1, 4096] - - [897, 8277.33] + - [900, 8277.33] - - [4096, 4012, 1, 1024] - - [877, 9726.47] + - [880, 9726.47] - - [4096, 3486, 1, 1024] - - [875, 9639.61] + - [878, 9639.61] - - [1024, 3479, 1, 4096] - - [885, 8420.27] + - [888, 8420.27] - - [1024, 3505, 1, 4096] - - [897, 8310.56] + - [900, 8310.56] - - [4096, 3381, 1, 1024] - - [878, 9357.65] + - [881, 9357.65] - - [4096, 3430, 1, 1024] - - [875, 9482.26] + - [878, 9482.26] - - [1024, 3554, 1, 4096] - - [897, 8592.28] + - [900, 8592.28] - - [4096, 3271, 1, 1024] - - [875, 9715.31] + - [878, 9715.31] - - [1024, 3063, 1, 4096] - - [879, 9388.46] + - [882, 9388.46] - - [1024, 3209, 1, 4096] - - [897, 8212.64] + - [900, 8212.64] - - [4096, 3503, 1, 1024] - - [877, 9680.49] + - [880, 9680.49] - - [4096, 3344, 1, 1024] - - [875, 9268.45] + - [878, 9268.45] - - [1024, 3147, 1, 4096] - - [898, 8037.1] + - [901, 8037.1] - - [1024, 3322, 1, 4096] - - [896, 8356.22] + - [899, 8356.22] - - [1024, 3341, 1, 4096] - - [897, 8316.23] + - [900, 8316.23] - - [1024, 3516, 1, 4096] - - [879, 8397.02] + - [882, 8397.02] - - [102, 101, 624, 64] - - [910, 4709.49] + - [913, 4709.49] - - [1024, 3454, 1, 4096] - - [896, 8425.5] + - [899, 8425.5] - - [4096, 3969, 1, 1024] - - [877, 9640.05] + - [880, 9640.05] - - [4096, 3466, 1, 1024] - - [877, 9576.73] + - [880, 9576.73] - - [1024, 3999, 1, 1024] - - [880, 9207.05] + - [883, 9207.05] - - [1024, 4032, 1, 1024] - - [881, 9294.46] + - [884, 9294.46] - - [1024, 3403, 1, 4096] - - [895, 8357.87] + - [898, 8357.87] - - [4096, 3361, 1, 1024] - - [877, 9308.68] + - [880, 9308.68] - - [1024, 3527, 1, 4096] - - [896, 8512.09] + - [899, 8512.09] - - [1024, 3822, 1, 4096] - - [880, 8991.03] + - [883, 8991.03] - - [4096, 3315, 1, 1024] - - [875, 9834.86] + - [878, 9834.86] - - [232, 232, 272, 64] - - [909, 6481.52] + - [912, 6481.52] - - [1024, 3336, 1, 4096] - - [898, 8295.51] + - [901, 8295.51] - - [228, 232, 272, 64] - - [910, 6327.75] + - [913, 6327.75] - - [4096, 3547, 1, 1024] - - [875, 9781.46] + - [878, 9781.46] - - [4096, 3340, 1, 1024] - - [877, 9269.62] + - [880, 9269.62] - - [1024, 3906, 1, 1024] - - [881, 9018.28] + - [884, 9018.28] - - [1024, 3295, 1, 4096] - - [895, 8194.73] + - [898, 8194.73] - - [4096, 3294, 1, 1024] - - [878, 9762.06] + - [881, 9762.06] - - [33708, 3968, 1, 1024] - - [878, 10147.7] + - [881, 10147.7] - - [1024, 3473, 1, 4096] - - [884, 8318.58] + - [887, 8318.58] - - [1024, 3072, 1, 4096] - - [881, 9370.03] + - [884, 9370.03] - - [4096, 3189, 1, 1024] - - [875, 9470.16] + - [878, 9470.16] - - [4096, 3494, 1, 1024] - - [875, 9661.22] + - [878, 9661.22] - - [1024, 3522, 1, 4096] - - [898, 8459.13] + - [901, 8459.13] - - [33708, 3944, 1, 1024] - - [878, 10060.1] + - [881, 10060.1] - - [135, 135, 480, 64] - - [923, 4256.93] + - [926, 4256.93] - - [4096, 3421, 1, 1024] - - [875, 9456.88] + - [878, 9456.88] - - [32, 32, 1984, 64] - - [920, 3436.14] + - [923, 3436.14] - - [4096, 3311, 1, 1024] - - [875, 9810.78] + - [878, 9810.78] - - [1024, 3990, 1, 1024] - - [882, 9197.64] + - [885, 9197.64] - - [1024, 3290, 1, 4096] - - [895, 8229.53] + - [898, 8229.53] - - [4096, 3565, 1, 1024] - - [876, 9824.38] + - [879, 9824.38] - - [1024, 3484, 1, 4096] - - [885, 8575.28] + - [888, 8575.28] - - [4096, 3384, 1, 1024] - - [875, 9366.44] + - [878, 9366.44] - - [1024, 3422, 1, 4096] - - [895, 8484.02] + - [898, 8484.02] - - [4096, 3681, 1, 1024] - - [876, 9520.06] + - [879, 9520.06] - - [1024, 3584, 1, 1024] - - [902, 8583.27] + - [905, 8583.27] - - [4096, 4050, 1, 1024] - - [877, 9807.25] + - [880, 9807.25] - - [1024, 3996, 1, 4096] - - [878, 9181.6] + - [881, 9181.6] - - [4096, 3169, 1, 1024] - - [876, 9411.3] + - [879, 9411.3] - - [4096, 3538, 1, 1024] - - [876, 9765.89] + - [879, 9765.89] - - [1024, 3495, 1, 4096] - - [882, 8295.85] + - [885, 8295.85] - - [4096, 3401, 1, 1024] - - [875, 9402.58] + - [878, 9402.58] - - [1024, 3560, 1, 4096] - - [896, 8513.35] + - [899, 8513.35] - - [133, 135, 480, 64] - - [923, 4198.98] + - [926, 4198.98] - - [1024, 3263, 1, 4096] - - [897, 8172.13] + - [900, 8172.13] - - [1024, 3870, 1, 4096] - - [877, 8996.17] + - [880, 8996.17] - - [4096, 3555, 1, 1024] - - [878, 9811.78] + - [881, 9811.78] - - [4096, 3412, 1, 1024] - - [875, 9431.99] + - [878, 9431.99] - - [101, 101, 624, 64] - - [909, 4667.59] + - [912, 4667.59] - - [1024, 3296, 1, 4096] - - [896, 8350.51] + - [899, 8350.51] - - [1024, 3379, 1, 4096] - - [898, 8432.84] + - [901, 8432.84] - - [4096, 3302, 1, 1024] - - [875, 9796.29] + - [878, 9796.29] - - [1024, 3490, 1, 4096] - - [895, 8538.34] + - [898, 8538.34] - - [1024, 3428, 1, 4096] - - [896, 8531.57] + - [899, 8531.57] - - [1024, 3976, 1, 4096] - - [877, 9327.77] + - [880, 9327.77] - - [4096, 3485, 1, 1024] - - [875, 9628.72] + - [878, 9628.72] - - [4096, 3534, 1, 1024] - - [875, 9755.87] + - [878, 9755.87] - - [1024, 3064, 1, 4096] - - [881, 9196.88] + - [884, 9196.88] - - [4096, 3216, 1, 1024] - - [877, 9563.34] + - [880, 9563.34] - - [1024, 3450, 1, 4096] - - [905, 8519.19] + - [908, 8519.19] - - [1024, 3533, 1, 4096] - - [896, 8495.67] + - [899, 8495.67] - - [1024, 4030, 1, 1024] - - [881, 9304.58] + - [884, 9304.58] - - [1024, 3311, 1, 4096] - - [896, 8278.5] + - [899, 8278.5] - - [1024, 3468, 1, 4096] - - [887, 8564.45] + - [890, 8564.45] - - [23, 23, 2720, 64] - - [911, 2311.45] + - [914, 2311.45] - - [4096, 3359, 1, 1024] - - [877, 9309.05] + - [880, 9309.05] - - [4096, 3392, 1, 1024] - - [877, 9388.09] + - [880, 9388.09] - - [1024, 3925, 1, 1024] - - [879, 9006.62] + - [882, 9006.62] - - [4096, 3233, 1, 1024] - - [875, 9603.54] + - [878, 9603.54] - - [4096, 3956, 1, 1024] - - [876, 9581.84] + - [879, 9581.84] - - [1024, 3463, 1, 4096] - - [897, 8293.87] + - [900, 8293.87] - - [1024, 3126, 1, 4096] - - [896, 7978.03] + - [899, 7978.03] - - [1024, 3363, 1, 4096] - - [889, 8267.37] + - [892, 8267.37] - - [4096, 3465, 1, 1024] - - [875, 9590.64] + - [878, 9590.64] - - [33708, 3996, 1, 1024] - - [876, 9899.89] + - [879, 9899.89] - - [1024, 3231, 1, 4096] - - [897, 8231.58] + - [900, 8231.58] - - [33708, 3978, 1, 1024] - - [876, 9853.54] + - [879, 9853.54] - - [4096, 3476, 1, 1024] - - [875, 9616.52] + - [878, 9616.52] - - [85, 85, 752, 64] - - [907, 4240.55] + - [910, 4240.55] - - [4096, 3339, 1, 1024] - - [877, 9249.71] + - [880, 9249.71] - - [4096, 3452, 1, 1024] - - [875, 9534.03] + - [878, 9534.03] - - [1024, 3396, 1, 4096] - - [896, 8451.13] + - [899, 8451.13] - - [4096, 3293, 1, 1024] - - [877, 9775.12] + - [880, 9775.12] - - [54, 54, 1184, 64] - - [909, 4153.44] + - [912, 4153.44] - - [1024, 3432, 1, 4096] - - [890, 8345.43] + - [893, 8345.43] - - [4096, 3493, 1, 1024] - - [878, 9649.8] + - [881, 9649.8] - - [4096, 3350, 1, 1024] - - [877, 9273.81] + - [880, 9273.81] - - [1024, 3079, 1, 4096] - - [905, 7775.56] + - [908, 7775.56] - - [1024, 3101, 1, 4096] - - [905, 7847.75] + - [908, 7847.75] - - [33708, 3939, 1, 1024] - - [878, 10054.3] + - [881, 10054.3] - - [4096, 3256, 1, 1024] - - [877, 9681.73] + - [880, 9681.73] - - [1024, 3439, 1, 4096] - - [896, 8531.01] + - [899, 8531.01] - - [1024, 3510, 1, 4096] - - [895, 8422.21] + - [898, 8422.21] - - [4096, 3900, 1, 1024] - - [876, 9468.51] + - [879, 9468.51] - - [1024, 3470, 1, 4096] - - [897, 8507.67] + - [900, 8507.67] - - [4096, 3456, 1, 1024] - - [877, 9577.36] + - [880, 9577.36] - - [4096, 3014, 1, 1024] - - [876, 9666.05] + - [879, 9666.05] - - [4096, 3367, 1, 1024] - - [878, 9328.26] + - [881, 9328.26] - - [4096, 3432, 1, 1024] - - [875, 9480.78] + - [878, 9480.78] - - [33708, 4026, 1, 1024] - - [878, 9972.73] + - [881, 9972.73] - - [4096, 3273, 1, 1024] - - [875, 9716.85] + - [878, 9716.85] - - [4096, 3130, 1, 1024] - - [875, 9311.3] + - [878, 9311.3] - - [1024, 3496, 1, 4096] - - [886, 8434.55] + - [889, 8434.55] - - [1024, 3995, 1, 4096] - - [871, 9157.63] + - [874, 9157.63] - - [1024, 3939, 1, 4096] - - [879, 9059.76] + - [882, 9059.76] - - [1024, 3121, 1, 4096] - - [903, 7963.33] + - [906, 7963.33] - - [1024, 3232, 1, 4096] - - [897, 8060.99] + - [900, 8060.99] - - [4096, 3147, 1, 1024] - - [877, 9364.53] + - [880, 9364.53] - - [4096, 3516, 1, 1024] - - [875, 9708.74] + - [878, 9708.74] - - [1024, 3969, 1, 1024] - - [881, 9168.58] + - [884, 9168.58] - - [1024, 3364, 1, 4096] - - [885, 8363.55] + - [888, 8363.55] - - [4096, 3411, 1, 1024] - - [878, 9442.67] + - [881, 9442.67] - - [147, 147, 432, 64] - - [922, 4843.11] + - [925, 4843.11] - - [4096, 3301, 1, 1024] - - [877, 9783.36] + - [880, 9783.36] - - [112, 111, 576, 64] - - [909, 5627.37] + - [912, 5627.37] - - [1024, 3513, 1, 4096] - - [896, 8725.31] + - [899, 8725.31] - - [1024, 3469, 1, 4096] - - [876, 8183.01] + - [879, 8183.01] - - [1024, 3095, 1, 4096] - - [897, 7887.77] + - [900, 7887.77] - - [4096, 3533, 1, 1024] - - [876, 9755.17] + - [879, 9755.17] - - [4096, 3390, 1, 1024] - - [875, 9377.11] + - [878, 9377.11] - - [4096, 3582, 1, 1024] - - [875, 9874.86] + - [878, 9874.86] - - [1024, 3956, 1, 1024] - - [881, 9058.72] + - [884, 9058.72] - - [4096, 3585, 1, 1024] - - [877, 9289.65] + - [880, 9289.65] - - [4096, 3231, 1, 1024] - - [876, 9597.05] + - [879, 9597.05] - - [1024, 3205, 1, 4096] - - [895, 8073.15] + - [898, 8073.15] - - [4096, 3496, 1, 1024] - - [876, 9668.28] + - [879, 9668.28] - - [1024, 3143, 1, 4096] - - [895, 8031.58] + - [898, 8031.58] - - [1024, 3318, 1, 4096] - - [892, 8261.33] + - [895, 8261.33] - - [1024, 3353, 1, 4096] - - [896, 8414.82] + - [899, 8414.82] - - [1024, 3464, 1, 4096] - - [895, 8309.93] + - [898, 8309.93] - - [4096, 2736, 1, 1024] - - [877, 9563.02] + - [880, 9563.02] - - [1024, 3402, 1, 4096] - - [892, 8413.74] + - [895, 8413.74] - - [4096, 3138, 1, 1024] - - [877, 9341.99] + - [880, 9341.99] - - [1024, 3860, 1, 4096] - - [880, 9008.47] + - [883, 9008.47] - - [148, 148, 432, 64] - - [922, 4915.6] + - [925, 4915.6] - - [1024, 3539, 1, 4096] - - [892, 8449.26] + - [895, 8449.26] - - [4096, 3211, 1, 1024] - - [877, 9551.18] + - [880, 9551.18] - - [1024, 3332, 1, 4096] - - [885, 8295.01] + - [888, 8295.01] - - [1024, 3466, 1, 4096] - - [896, 8339.15] + - [899, 8339.15] - - [4096, 3475, 1, 1024] - - [875, 9612.23] + - [878, 9612.23] - - [4096, 3524, 1, 1024] - - [878, 9722.64] + - [881, 9722.64] - - [4096, 2985, 1, 1024] - - [878, 9591.23] + - [881, 9591.23] - - [4096, 3222, 1, 1024] - - [875, 9577.38] + - [878, 9577.38] - - [4096, 3451, 1, 1024] - - [877, 9541.32] + - [880, 9541.32] - - [1024, 3181, 1, 4096] - - [895, 8118.79] + - [898, 8118.79] - - [1024, 3640, 1, 4096] - - [880, 8617.01] + - [883, 8617.01] - - [1024, 3375, 1, 4096] - - [884, 8419.65] + - [887, 8419.65] - - [1024, 3550, 1, 4096] - - [897, 8512.73] + - [900, 8512.73] - - [1024, 4020, 1, 1024] - - [881, 9266.8] + - [884, 9266.8] - - [1024, 3840, 1, 4096] - - [880, 8983.39] + - [883, 8983.39] - - [4096, 3349, 1, 1024] - - [875, 9279.86] + - [878, 9279.86] - - [4096, 3398, 1, 1024] - - [876, 9402.22] + - [879, 9402.22] - - [33708, 3976, 1, 1024] - - [877, 9849.44] + - [880, 9849.44] - - [1024, 2917, 1, 4096] - - [882, 8936.77] + - [885, 8936.77] - - [33708, 3910, 1, 1024] - - [875, 9983.25] + - [878, 9983.25] - - [4096, 3860, 1, 1024] - - [876, 9377.48] + - [879, 9377.48] - - [4096, 3304, 1, 1024] - - [878, 9798.34] + - [881, 9798.34] - - [1024, 3286, 1, 4096] - - [883, 8167.31] + - [886, 8167.31] - - [1024, 3460, 1, 4096] - - [893, 8539.46] + - [896, 8539.46] - - [1024, 4026, 1, 4096] - - [879, 9305.58] + - [882, 9305.58] - - [4096, 3471, 1, 1024] - - [877, 9596.61] + - [880, 9596.61] - - [193, 193, 320, 64] - - [925, 4758.36] + - [928, 4758.36] - - [1024, 3894, 1, 1024] - - [879, 8979.5] + - [882, 8979.5] - - [65, 65, 992, 64] - - [921, 2565.39] + - [924, 2565.39] - - [1024, 3506, 1, 4096] - - [893, 8593.12] + - [896, 8593.12] - - [35, 35, 1808, 64] - - [915, 2129.62] + - [918, 2129.62] - - [1024, 4000, 1, 1024] - - [879, 9204.5] + - [882, 9204.5] - - [1024, 3900, 1, 4096] - - [875, 9050.26] + - [878, 9050.26] - - [1024, 3445, 1, 4096] - - [898, 8551.55] + - [901, 8551.55] - - [4096, 3442, 1, 1024] - - [876, 9504.9] + - [879, 9504.9] - - [1024, 3358, 1, 4096] - - [897, 8437.06] + - [900, 8437.06] - - [13, 13, 4672, 64] - - [908, 860.565] + - [911, 860.565] - - [1024, 3211, 1, 4096] - - [901, 8085.15] + - [904, 8085.15] - - [4096, 3515, 1, 1024] - - [877, 9715.19] + - [880, 9715.19] - - [1024, 3564, 1, 4096] - - [883, 8760.27] + - [886, 8760.27] - - [4096, 3057, 1, 1024] - - [877, 9803.95] + - [880, 9803.95] - - [1024, 3343, 1, 4096] - - [895, 8363.7] + - [898, 8363.7] - - [4096, 3262, 1, 1024] - - [876, 9686.39] + - [879, 9686.39] - - [1024, 3518, 1, 4096] - - [895, 8454.95] + - [898, 8454.95] - - [77, 77, 816, 64] - - [914, 3505.84] + - [917, 3505.84] - - [33708, 3876, 1, 1024] - - [876, 9895.85] + - [879, 9895.85] - - [4096, 3462, 1, 1024] - - [877, 9570.21] + - [880, 9570.21] - - [1024, 3265, 1, 4096] - - [895, 8322.65] + - [898, 8322.65] - - [4096, 3389, 1, 1024] - - [876, 9382.76] + - [879, 9382.76] - - [4096, 3438, 1, 1024] - - [877, 9503.37] + - [880, 9503.37] - - [1024, 3955, 1, 1024] - - [879, 9064.35] + - [882, 9064.35] - - [1024, 3545, 1, 4096] - - [898, 8652.31] + - [901, 8652.31] - - [1024, 3144, 1, 4096] - - [898, 8060.45] + - [901, 8060.45] - - [1024, 3417, 1, 4096] - - [896, 8505.81] + - [899, 8505.81] - - [4096, 3543, 1, 1024] - - [875, 9775.57] + - [878, 9775.57] - - [4096, 3352, 1, 1024] - - [877, 9282.77] + - [880, 9282.77] - - [33708, 3975, 1, 1024] - - [878, 9849.39] + - [881, 9849.39] - - [148, 147, 432, 64] - - [922, 4876.05] + - [925, 4876.05] - - [4096, 3137, 1, 1024] - - [875, 9330.53] + - [878, 9330.53] - - [4096, 3506, 1, 1024] - - [878, 9682.66] + - [881, 9682.66] - - [1024, 3975, 1, 1024] - - [881, 9164.67] + - [884, 9164.67] - - [1024, 3859, 1, 4096] - - [879, 8983.74] + - [882, 8983.74] - - [4096, 3369, 1, 1024] - - [877, 9330.35] + - [880, 9330.35] - - [1024, 3434, 1, 4096] - - [895, 8486.88] + - [898, 8486.88] - - [1024, 3292, 1, 4096] - - [895, 8478.86] + - [898, 8478.86] - - [4096, 3523, 1, 1024] - - [875, 9734.73] + - [878, 9734.73] - - [4096, 3380, 1, 1024] - - [877, 9354.39] + - [880, 9354.39] - - [1024, 3408, 1, 4096] - - [898, 8440.93] + - [901, 8440.93] - - [4096, 3221, 1, 1024] - - [877, 9575.49] + - [880, 9575.49] - - [4096, 3270, 1, 1024] - - [877, 9717.85] + - [880, 9717.85] - - [143, 143, 432, 64] - - [923, 4643.35] + - [926, 4643.35] - - [111, 111, 576, 64] - - [915, 5474.94] + - [918, 5474.94] - - [1024, 3303, 1, 4096] - - [897, 8412.97] + - [900, 8412.97] - - [4096, 3502, 1, 1024] - - [877, 9679.77] + - [880, 9679.77] - - [1024, 3222, 1, 4096] - - [897, 8141.78] + - [900, 8141.78] - - [4096, 2505, 1, 1024] - - [875, 9594.85] + - [878, 9594.85] - - [4096, 3397, 1, 1024] - - [875, 9392.51] + - [878, 9392.51] - - [4096, 3562, 1, 1024] - - [875, 9827.48] + - [878, 9827.48] - - [4096, 3095, 1, 1024] - - [877, 9222.35] + - [880, 9222.35] - - [1024, 3226, 1, 4096] - - [893, 8026.93] + - [896, 8026.93] - - [177, 177, 352, 64] - - [910, 6406.86] + - [913, 6406.86] - - [4096, 3360, 1, 1024] - - [876, 9298.05] + - [879, 9298.05] - - [1024, 3942, 1, 1024] - - [881, 9061.49] + - [884, 9061.49] - - [1024, 3298, 1, 4096] - - [898, 8254.26] + - [901, 8254.26] - - [1024, 3381, 1, 4096] - - [897, 8508.71] + - [900, 8508.71] - - [4096, 3314, 1, 1024] - - [877, 9837.46] + - [880, 9837.46] - - [1024, 3492, 1, 4096] - - [885, 8583.29] + - [888, 8583.29] - - [1024, 3430, 1, 4096] - - [885, 8492.61] + - [888, 8492.61] - - [4096, 3977, 1, 1024] - - [877, 9656.35] + - [880, 9656.35] - - [4096, 3546, 1, 1024] - - [875, 9780.25] + - [878, 9780.25] - - [4096, 3640, 1, 1024] - - [875, 9415.41] + - [878, 9415.41] - - [4096, 3441, 1, 1024] - - [876, 9499.14] + - [879, 9499.14] - - [33708, 4059, 1, 1024] - - [878, 10051.8] + - [881, 10051.8] - - [1024, 3978, 1, 1024] - - [879, 9158.7] + - [882, 9158.7] - - [1024, 3376, 1, 4096] - - [897, 8415.34] + - [900, 8415.34] - - [1024, 3482, 1, 4096] - - [898, 8396.52] + - [901, 8396.52] - - [1024, 3563, 1, 4096] - - [881, 8424.08] + - [884, 8424.08] - - [4096, 4020, 1, 1024] - - [878, 9745.86] + - [881, 9745.86] - - [1024, 3271, 1, 4096] - - [896, 8289.58] + - [899, 8289.58] - - [1024, 3291, 1, 4096] - - [896, 8222.61] + - [899, 8222.61] - - [1024, 3431, 1, 4096] - - [891, 8464.3] + - [894, 8464.3] - - [1024, 3481, 1, 4096] - - [897, 8386.4] + - [900, 8386.4] - - [84, 85, 752, 64] - - [912, 4194.75] + - [915, 4194.75] - - [4096, 3461, 1, 1024] - - [875, 9579.57] + - [878, 9579.57] - - [1024, 3574, 1, 4096] - - [898, 8579.7] + - [901, 8579.7] - - [1024, 4059, 1, 1024] - - [879, 9330.44] + - [882, 9330.44] - - [84, 84, 752, 64] - - [919, 4141.36] + - [922, 4141.36] - - [1024, 3421, 1, 4096] - - [898, 8528.32] + - [901, 8528.32] - - [4096, 3224, 1, 1024] - - [877, 9589.85] + - [880, 9589.85] - - [4096, 3437, 1, 1024] - - [877, 9498.1] + - [880, 9498.1] - - [45, 45, 1424, 64] - - [909, 3314.48] + - [912, 3314.48] - - [4096, 3840, 1, 1024] - - [875, 9931.27] + - [878, 9931.27] - - [4096, 3168, 1, 1024] - - [877, 9412.06] + - [880, 9412.06] - - [33708, 3990, 1, 1024] - - [875, 9884.29] + - [878, 9884.29] - - [1024, 3349, 1, 4096] - - [897, 8421.3] + - [900, 8421.3] - - [4096, 3335, 1, 1024] - - [875, 9241.55] + - [878, 9241.55] - - [4096, 3400, 1, 1024] - - [877, 9407.25] + - [880, 9407.25] - - [160, 159, 400, 64] - - [924, 5708.84] + - [927, 5708.84] - - [1024, 3398, 1, 4096] - - [897, 8623.93] + - [900, 8623.93] - - [1024, 3780, 1, 4096] - - [877, 8756.68] + - [880, 8756.68] - - [29, 29, 2176, 64] - - [920, 2963.59] + - [923, 2963.59] - - [4096, 3098, 1, 1024] - - [875, 9229.72] + - [878, 9229.72] - - [1024, 4012, 1, 4096] - - [881, 9421.93] + - [884, 9421.93] - - [4096, 3505, 1, 1024] - - [877, 9687.55] + - [880, 9687.55] - - [4096, 3554, 1, 1024] - - [877, 9812.12] + - [880, 9812.12] - - [4096, 3063, 1, 1024] - - [877, 9825.0] + - [880, 9825.0] - - [1024, 3503, 1, 4096] - - [895, 8404.64] + - [898, 8404.64] - - [1024, 3166, 1, 4096] - - [898, 8084.83] + - [901, 8084.83] - - [1024, 3425, 1, 4096] - - [898, 8537.48] + - [901, 8537.48] - - [1024, 3344, 1, 4096] - - [889, 8351.06] + - [892, 8351.06] - - [4096, 3484, 1, 1024] - - [877, 9635.6] + - [880, 9635.6] - - [1024, 3681, 1, 1024] - - [880, 8457.08] + - [883, 8457.08] - - [1024, 4050, 1, 1024] - - [881, 9326.11] + - [884, 9326.11] - - [4096, 3379, 1, 1024] - - [875, 9356.06] + - [878, 9356.06] - - [4096, 3428, 1, 1024] - - [876, 9472.23] + - [879, 9472.23] - - [12, 12, 5040, 64] - - [914, 741.517] + - [917, 741.517] - - [27, 27, 2336, 64] - - [920, 2757.8] + - [923, 2757.8] - - [1024, 3304, 1, 4096] - - [898, 8317.72] + - [901, 8317.72] - - [1024, 3387, 1, 4096] - - [896, 8460.05] + - [899, 8460.05] - - [4096, 3126, 1, 1024] - - [878, 9308.38] + - [881, 9308.38] - - [1024, 3498, 1, 4096] - - [895, 8485.45] + - [898, 8485.45] - - [1024, 3436, 1, 4096] - - [897, 8397.61] + - [900, 8397.61] - - [4096, 3501, 1, 1024] - - [875, 9681.09] + - [878, 9681.09] - - [4096, 3358, 1, 1024] - - [877, 9304.8] + - [880, 9304.8] - - [4096, 3232, 1, 1024] - - [875, 9607.1] + - [878, 9607.1] - - [1024, 3585, 1, 4096] - - [879, 8510.64] + - [882, 8510.64] - - [4096, 3143, 1, 1024] - - [878, 9355.81] + - [881, 9355.81] - - [4096, 3464, 1, 1024] - - [877, 9585.85] + - [880, 9585.85] - - [1024, 3366, 1, 4096] - - [885, 8275.13] + - [888, 8275.13] - - [4096, 3375, 1, 1024] - - [875, 9342.03] + - [878, 9342.03] - - [4096, 2917, 1, 1024] - - [875, 9372.74] + - [878, 9372.74] - - [4096, 4026, 1, 1024] - - [877, 9759.05] + - [880, 9759.05] - - [49, 49, 1296, 64] - - [916, 3709.92] + - [919, 3709.92] - - [1024, 3277, 1, 4096] - - [896, 8217.0] + - [899, 8217.0] - - [1024, 3103, 1, 4096] - - [897, 7872.57] + - [900, 7872.57] - - [33708, 3995, 1, 1024] - - [877, 9892.98] + - [880, 9892.98] - - [1024, 3297, 1, 4096] - - [896, 8185.72] + - [899, 8185.72] - - [4096, 3545, 1, 1024] - - [877, 9789.33] + - [880, 9789.33] - - [1024, 3399, 1, 4096] - - [896, 8377.08] + - [899, 8377.08] - - [33708, 3796, 1, 1024] - - [876, 10007.9] + - [879, 10007.9] - - [4096, 3292, 1, 1024] - - [877, 9767.18] + - [880, 9767.18] - - [71, 71, 896, 64] - - [911, 3006.15] + - [914, 3006.15] - - [33708, 3859, 1, 1024] - - [878, 9860.27] + - [881, 9860.27] - - [4096, 3566, 1, 1024] - - [877, 9834.37] + - [880, 9834.37] - - [4096, 3894, 1, 1024] - - [875, 9456.57] + - [878, 9456.57] - - [4096, 3492, 1, 1024] - - [875, 9653.14] + - [878, 9653.14] - - [1024, 3977, 1, 1024] - - [881, 9161.23] + - [884, 9161.23] - - [1024, 3272, 1, 4096] - - [898, 8256.99] + - [901, 8256.99] - - [135, 134, 480, 64] - - [922, 4238.29] + - [925, 4238.29] - - [1024, 3355, 1, 4096] - - [896, 8374.54] + - [899, 8374.54] - - [4096, 3419, 1, 1024] - - [878, 9455.34] + - [881, 9455.34] - - [1024, 3404, 1, 4096] - - [897, 8580.18] + - [900, 8580.18] - - [4096, 3999, 1, 1024] - - [877, 9701.68] + - [880, 9701.68] - - [4096, 3166, 1, 1024] - - [875, 9410.38] + - [878, 9410.38] - - [33708, 3840, 1, 1024] - - [878, 10132.8] + - [881, 10132.8] - - [4096, 4032, 1, 1024] - - [878, 9762.76] + - [881, 9762.76] - - [1024, 3573, 1, 4096] - - [896, 8603.3] + - [899, 8603.3] - - [4096, 3366, 1, 1024] - - [878, 9322.53] + - [881, 9322.53] - - [1024, 3541, 1, 4096] - - [898, 8405.8] + - [901, 8405.8] - - [4096, 3207, 1, 1024] - - [875, 9544.15] + - [878, 9544.15] - - [4096, 3272, 1, 1024] - - [877, 9716.63] + - [880, 9716.63] - - [1024, 3334, 1, 4096] - - [895, 8241.29] + - [898, 8241.29] - - [228, 228, 272, 64] - - [910, 6232.35] + - [913, 6232.35] - - [4096, 3183, 1, 1024] - - [877, 9452.34] + - [880, 9452.34] - - [4096, 3536, 1, 1024] - - [876, 9759.34] + - [879, 9759.34] - - [1024, 4005, 1, 1024] - - [880, 9225.73] + - [883, 9225.73] - - [1024, 3245, 1, 4096] - - [897, 8074.21] + - [900, 8074.21] - - [4096, 3447, 1, 1024] - - [876, 9525.74] + - [879, 9525.74] - - [1024, 3183, 1, 4096] - - [896, 8121.52] + - [899, 8121.52] - - [1024, 3361, 1, 4096] - - [898, 8285.76] + - [901, 8285.76] - - [33708, 3870, 1, 1024] - - [876, 9879.25] + - [879, 9879.25] - - [1024, 3321, 1, 4096] - - [897, 8408.57] + - [900, 8408.57] - - [1024, 3968, 1, 1024] - - [879, 9201.95] + - [882, 9201.95] - - [1024, 3486, 1, 4096] - - [893, 8258.79] + - [896, 8258.79] - - [4096, 4005, 1, 1024] - - [877, 9723.88] + - [880, 9723.88] - - [4096, 3410, 1, 1024] - - [878, 9440.4] + - [881, 9440.4] - - [1024, 3944, 1, 1024] - - [881, 9040.72] + - [884, 9040.72] - - [4096, 3300, 1, 1024] - - [876, 9789.8] + - [879, 9789.8] - - [4096, 3579, 1, 1024] - - [878, 9859.34] + - [881, 9859.34] - - [4096, 3483, 1, 1024] - - [878, 9624.21] + - [881, 9624.21] - - [4096, 3532, 1, 1024] - - [877, 9742.66] + - [880, 9742.66] - - [1024, 3140, 1, 4096] - - [897, 7899.55] + - [900, 7899.55] - - [1024, 3372, 1, 4096] - - [895, 8236.97] + - [898, 8236.97] - - [1024, 3224, 1, 4096] - - [898, 8159.03] + - [901, 8159.03] - - [4096, 3230, 1, 1024] - - [877, 9601.15] + - [880, 9601.15] - - [4096, 3427, 1, 1024] - - [877, 9466.47] + - [880, 9466.47] - - [1024, 3796, 1, 1024] - - [881, 8739.68] + - [884, 8739.68] - - [143, 148, 432, 64] - - [922, 4761.9] + - [925, 4761.9] - - [1024, 3616, 1, 4096] - - [880, 8445.79] + - [883, 8445.79] - - [1024, 3315, 1, 4096] - - [897, 8403.11] + - [900, 8403.11] - - [1024, 3476, 1, 4096] - - [895, 8523.58] + - [898, 8523.58] - - [1024, 3509, 1, 4096] - - [895, 8344.95] + - [898, 8344.95] - - [4096, 3357, 1, 1024] - - [877, 9300.06] + - [880, 9300.06] - - [4096, 3406, 1, 1024] - - [877, 9427.34] + - [880, 9427.34] - - [1024, 3558, 1, 4096] - - [896, 8525.68] + - [899, 8525.68] - - [4096, 3593, 1, 1024] - - [877, 9302.1] + - [880, 9302.1] - - [4096, 3247, 1, 1024] - - [877, 9648.4] + - [880, 9648.4] - - [4096, 3088, 1, 1024] - - [877, 9204.11] + - [880, 9204.11] - - [1024, 3213, 1, 4096] - - [895, 8054.21] + - [898, 8054.21] - - [4096, 3511, 1, 1024] - - [875, 9702.6] + - [878, 9702.6] - - [122, 122, 528, 64] - - [916, 6293.29] + - [919, 6293.29] - - [1024, 3365, 1, 4096] - - [892, 8413.52] + - [895, 8413.52] - - [1024, 3504, 1, 4096] - - [894, 8414.36] + - [897, 8414.36] - - [1024, 3442, 1, 4096] - - [897, 8683.9] + - [900, 8683.9] - - [4096, 3474, 1, 1024] - - [875, 9611.5] + - [878, 9611.5] - - [4096, 2984, 1, 1024] - - [876, 9592.72] + - [879, 9592.72] - - [1024, 3876, 1, 4096] - - [879, 9085.85] + - [882, 9085.85] - - [4096, 3337, 1, 1024] - - [877, 9246.12] + - [880, 9246.12] - - [4096, 3450, 1, 1024] - - [877, 9534.53] + - [880, 9534.53] - - [1024, 3547, 1, 4096] - - [897, 8386.63] + - [900, 8386.63] - - [4096, 3291, 1, 1024] - - [876, 9759.24] + - [879, 9759.24] - - [1024, 3340, 1, 4096] - - [896, 8237.87] + - [899, 8237.87] - - [4096, 3491, 1, 1024] - - [877, 9656.49] + - [880, 9656.49] - - [4096, 3348, 1, 1024] - - [877, 9279.05] + - [880, 9279.05] - - [78, 78, 816, 64] - - [917, 3590.99] + - [920, 3590.99] - - [4096, 3968, 1, 1024] - - [878, 9642.09] + - [881, 9642.09] - - [4096, 3906, 1, 1024] - - [878, 9485.27] + - [881, 9485.27] - - [1024, 3477, 1, 4096] - - [885, 8389.1] + - [888, 8389.1] - - [1024, 3397, 1, 4096] - - [895, 8556.78] + - [898, 8556.78] - - [4096, 3165, 1, 1024] - - [876, 9415.42] + - [879, 9415.42] - - [4096, 3470, 1, 1024] - - [875, 9598.4] + - [878, 9598.4] - - [1024, 3526, 1, 4096] - - [895, 8442.05] + - [898, 8442.05] - - [112, 112, 576, 64] - - [910, 5672.5] + - [913, 5672.5] - - [4096, 3365, 1, 1024] - - [875, 9321.73] + - [878, 9321.73] - - [4096, 3319, 1, 1024] - - [875, 9838.38] + - [878, 9838.38] - - [1024, 3401, 1, 4096] - - [897, 8460.76] + - [900, 8460.76] - - [1024, 3294, 1, 4096] - - [896, 8324.53] + - [899, 8324.53] - - [159, 159, 400, 64] - - [912, 5488.41] + - [915, 5488.41] - - [1024, 3472, 1, 4096] - - [890, 8289.67] + - [893, 8289.67] - - [4096, 3328, 1, 1024] - - [876, 9904.25] + - [879, 9904.25] - - [1024, 3861, 1, 1024] - - [881, 8917.53] + - [884, 8917.53] - - [1024, 3910, 1, 1024] - - [879, 9010.06] + - [882, 9010.06] - - [1024, 3410, 1, 4096] - - [897, 8519.53] + - [900, 8519.53] - - [1024, 3395, 1, 4096] - - [895, 8424.25] + - [898, 8424.25] - - [4096, 3282, 1, 1024] - - [875, 9743.57] + - [878, 9743.57] - - [1024, 3751, 1, 1024] - - [882, 8680.29] + - [885, 8680.29] - - [4096, 3145, 1, 1024] - - [877, 9353.27] + - [880, 9353.27] - - [4096, 3514, 1, 1024] - - [877, 9712.94] + - [880, 9712.94] - - [4096, 3944, 1, 1024] - - [877, 9563.82] + - [880, 9563.82] - - [1024, 3515, 1, 4096] - - [896, 8428.03] + - [899, 8428.03] - - [4096, 3409, 1, 1024] - - [876, 9428.67] + - [879, 9428.67] - - [4096, 3564, 1, 1024] - - [875, 9823.69] + - [878, 9823.69] - - [4096, 3299, 1, 1024] - - [877, 9792.93] + - [880, 9792.93] - - [1024, 3057, 1, 4096] - - [873, 9237.75] + - [876, 9237.75] - - [4096, 3531, 1, 1024] - - [875, 9745.54] + - [878, 9745.54] - - [4096, 3388, 1, 1024] - - [877, 9374.55] + - [880, 9374.55] - - [1024, 3189, 1, 4096] - - [897, 8084.5] + - [900, 8084.5] - - [1024, 3300, 1, 4096] - - [897, 8185.03] + - [900, 8185.03] - - [1024, 3720, 1, 4096] - - [876, 8755.01] + - [879, 8755.01] - - [1024, 3383, 1, 4096] - - [890, 8463.37] + - [893, 8463.37] - - [1024, 3494, 1, 4096] - - [897, 8676.47] + - [900, 8676.47] - - [77, 78, 816, 64] - - [913, 3548.16] + - [916, 3548.16] - - [1024, 3448, 1, 4096] - - [895, 8665.68] + - [898, 8665.68] - - [4096, 3542, 1, 1024] - - [875, 9771.78] + - [878, 9771.78] - - [1024, 3488, 1, 4096] - - [895, 8488.29] + - [898, 8488.29] - - [4096, 3405, 1, 1024] - - [877, 9426.06] + - [880, 9426.06] - - [1024, 3262, 1, 4096] - - [897, 8206.87] + - [900, 8206.87] - - [33708, 4005, 1, 1024] - - [878, 9928.06] + - [881, 9928.06] - - [1024, 3594, 1, 4096] - - [882, 8458.47] + - [885, 8458.47] - - [4096, 3103, 1, 1024] - - [878, 9243.04] + - [881, 9243.04] - - [4096, 3136, 1, 1024] - - [877, 9340.8] + - [880, 9340.8] - - [1024, 3378, 1, 4096] - - [898, 8432.35] + - [901, 8432.35] - - [10, 10, 5952, 64] - - [918, 523.253] + - [921, 523.253] - - [7, 7, 8192, 64] - - [918, 260.443] + - [921, 260.443] - - [4096, 3559, 1, 1024] - - [877, 9813.0] + - [880, 9813.0] - - [4096, 3368, 1, 1024] - - [878, 9328.56] + - [881, 9328.56] - - [4096, 3209, 1, 1024] - - [875, 9538.73] + - [878, 9538.73] - - [4096, 3322, 1, 1024] - - [877, 9839.48] + - [880, 9839.48] - - [1024, 3483, 1, 4096] - - [883, 8348.25] + - [886, 8348.25] - - [4096, 3473, 1, 1024] - - [876, 9605.69] + - [879, 9605.69] - - [4096, 3522, 1, 1024] - - [878, 9729.92] + - [881, 9729.92] - - [1024, 3532, 1, 4096] - - [896, 8474.22] + - [899, 8474.22] - - [4096, 3449, 1, 1024] - - [877, 9528.25] + - [880, 9528.25] - - [1024, 3351, 1, 4096] - - [898, 8311.13] + - [901, 8311.13] - - [1024, 3462, 1, 4096] - - [895, 8297.54] + - [898, 8297.54] - - [4096, 3396, 1, 1024] - - [877, 9400.15] + - [880, 9400.15] - - [132, 132, 480, 64] - - [923, 4089.74] + - [926, 4089.74] - - [111, 112, 576, 64] - - [909, 5529.6] + - [912, 5529.6] - - [1024, 3416, 1, 4096] - - [896, 8556.54] + - [899, 8556.54] - - [4096, 3469, 1, 1024] - - [878, 9598.67] + - [881, 9598.67] - - [1024, 3582, 1, 4096] - - [879, 8461.37] + - [882, 8461.37] - - [1024, 3230, 1, 4096] - - [896, 8188.84] + - [899, 8188.84] - - [1024, 3489, 1, 4096] - - [897, 8457.75] + - [900, 8457.75] - - [1024, 3427, 1, 4096] - - [897, 8566.49] + - [900, 8566.49] - - [1024, 3346, 1, 4096] - - [896, 8352.07] + - [899, 8352.07] - - [33708, 3977, 1, 1024] - - [878, 9868.4] + - [881, 9868.4] - - [4096, 3796, 1, 1024] - - [877, 9797.66] + - [880, 9797.66] - - [4096, 3176, 1, 1024] - - [877, 9435.29] + - [880, 9435.29] - - [4096, 3990, 1, 1024] - - [875, 9672.23] + - [878, 9672.23] - - [1024, 3257, 1, 4096] - - [898, 8225.07] + - [901, 8225.07] - - [4096, 3343, 1, 1024] - - [899, 9273.52] + - [902, 9273.52] - - [4096, 3440, 1, 1024] - - [875, 9501.38] + - [878, 9501.38] - - [33708, 4030, 1, 1024] - - [876, 9983.26] + - [879, 9983.26] - - [1024, 3190, 1, 4096] - - [897, 8192.01] + - [900, 8192.01] - - [1024, 3389, 1, 4096] - - [898, 8439.32] + - [901, 8439.32] - - [1024, 3500, 1, 4096] - - [896, 8556.02] + - [899, 8556.02] - - [1024, 3471, 1, 4096] - - [885, 8491.07] + - [888, 8491.07] - - [1024, 3438, 1, 4096] - - [898, 8567.85] + - [901, 8567.85] - - [4096, 3513, 1, 1024] - - [875, 9710.17] + - [878, 9710.17] - - [1024, 3562, 1, 4096] - - [890, 8608.84] + - [893, 8608.84] - - [4096, 3616, 1, 1024] - - [877, 9357.49] + - [880, 9357.49] - - [4096, 3955, 1, 1024] - - [876, 9589.61] + - [879, 9589.61] - - [1024, 3441, 1, 4096] - - [886, 8359.17] + - [889, 8359.17] - - [1024, 3236, 1, 4096] - - [900, 8022.5] + - [903, 8022.5] - - [1024, 3524, 1, 4096] - - [895, 8477.14] + - [898, 8477.14] - - [4096, 3460, 1, 1024] - - [875, 9581.86] + - [878, 9581.86] - - [16, 16, 3840, 64] - - [907, 1270.49] + - [910, 1270.49] - - [92, 93, 688, 64] - - [911, 4962.3] + - [914, 4962.3] - - [1024, 3384, 1, 4096] - - [886, 8409.29] + - [889, 8409.29] - - [4096, 3387, 1, 1024] - - [877, 9379.7] + - [880, 9379.7] - - [4096, 3436, 1, 1024] - - [875, 9491.83] + - [878, 9491.83] - - [4096, 3277, 1, 1024] - - [875, 9717.17] + - [878, 9717.17] - - [1024, 3457, 1, 4096] - - [895, 8279.12] + - [898, 8279.12] - - [1024, 3999, 1, 4096] - - [870, 9231.37] + - [873, 9231.37] - - [1024, 4032, 1, 4096] - - [879, 9443.52] + - [882, 9443.52] - - [4096, 3541, 1, 1024] - - [875, 9773.14] + - [878, 9773.14] - - [4096, 3334, 1, 1024] - - [875, 9242.69] + - [878, 9242.69] - - [1024, 3393, 1, 4096] - - [897, 8376.07] + - [900, 8376.07] - - [17, 17, 3632, 64] - - [919, 1425.67] + - [922, 1425.67] - - [1024, 3411, 1, 4096] - - [885, 8490.87] + - [888, 8490.87] - - [1024, 3822, 1, 1024] - - [882, 8773.34] + - [885, 8773.34] - - [1024, 3593, 1, 4096] - - [882, 8571.15] + - [885, 8571.15] - - [33708, 3822, 1, 1024] - - [876, 10056.7] + - [879, 10056.7] - - [4096, 3504, 1, 1024] - - [878, 9680.19] + - [881, 9680.19] - - [1024, 3163, 1, 4096] - - [897, 8014.33] + - [900, 8014.33] - - [1024, 3357, 1, 4096] - - [898, 8375.94] + - [901, 8375.94] - - [1024, 3906, 1, 4096] - - [879, 9108.12] + - [882, 9108.12] - - [4096, 3415, 1, 1024] - - [875, 9443.77] + - [878, 9443.77] - - [1024, 3406, 1, 4096] - - [898, 8451.54] + - [901, 8451.54] - - [4096, 3321, 1, 1024] - - [877, 9836.52] + - [880, 9836.52] - - [4096, 3584, 1, 1024] - - [878, 9915.83] + - [881, 9915.83] - - [1024, 2736, 1, 4096] - - [881, 8532.83] + - [884, 8532.83] - - [1024, 3110, 1, 4096] - - [898, 7889.19] + - [901, 7889.19] - - [33708, 3999, 1, 1024] - - [878, 9903.23] + - [881, 9903.23] - - [1024, 3093, 1, 4096] - - [896, 7919.25] + - [899, 7919.25] - - [4096, 3378, 1, 1024] - - [878, 9362.2] + - [881, 9362.2] - - [1024, 3543, 1, 4096] - - [892, 8438.06] + - [895, 8438.06] - - [33708, 3925, 1, 1024] - - [877, 10021.5] + - [880, 10021.5] - - [1024, 3352, 1, 4096] - - [898, 8333.72] + - [901, 8333.72] - - [4096, 3780, 1, 1024] - - [875, 9754.92] + - [878, 9754.92] - - [1024, 3990, 1, 4096] - - [872, 9250.92] + - [875, 9250.92] - - [4096, 3500, 1, 1024] - - [875, 9673.73] + - [878, 9673.73] - - [4096, 3996, 1, 1024] - - [876, 9694.4] + - [879, 9694.4] - - [1024, 3247, 1, 4096] - - [901, 8171.48] + - [904, 8171.48] - - [4096, 3395, 1, 1024] - - [877, 9391.94] + - [880, 9391.94] - - [1024, 3169, 1, 4096] - - [896, 7990.14] + - [899, 7990.14] - - [1024, 3088, 1, 4096] - - [896, 7890.26] + - [899, 7890.26] - - [1024, 3584, 1, 4096] - - [898, 8604.1] + - [901, 8604.1] - - [4096, 3093, 1, 1024] - - [877, 9224.78] + - [880, 9224.78] - - [1024, 3538, 1, 4096] - - [879, 8395.64] + - [882, 8395.64] - - [1024, 3996, 1, 1024] - - [880, 9208.23] + - [883, 9208.23] - - [1024, 3581, 1, 4096] - - [892, 8523.14] + - [895, 8523.14] - - [4096, 3374, 1, 1024] - - [877, 9342.71] + - [880, 9342.71] - - [33708, 3751, 1, 1024] - - [877, 9881.89] + - [880, 9881.89] - - [59, 59, 1088, 64] - - [915, 4515.44] + - [918, 4515.44] - - [4096, 3215, 1, 1024] - - [877, 9557.65] + - [880, 9557.65] - - [4096, 3312, 1, 1024] - - [875, 9834.3] + - [878, 9834.3] - - [4096, 3581, 1, 1024] - - [877, 9856.56] + - [880, 9856.56] - - [4096, 3479, 1, 1024] - - [877, 9620.25] + - [880, 9620.25] - - [4096, 3544, 1, 1024] - - [875, 9778.84] + - [878, 9778.84] - - [1024, 3870, 1, 1024] - - [880, 8935.16] + - [883, 8935.16] - - [1024, 3374, 1, 4096] - - [897, 8412.75] + - [900, 8412.75] - - [1024, 2967, 1, 4096] - - [880, 8982.87] + - [883, 8982.87] - - [41, 41, 1552, 64] - - [909, 2805.28] + - [912, 2805.28] - - [4096, 3455, 1, 1024] - - [875, 9538.79] + - [878, 9538.79] - - [4096, 3942, 1, 1024] - - [876, 9554.55] + - [879, 9554.55] - - [1024, 3528, 1, 4096] - - [895, 8438.37] + - [898, 8438.37] - - [4096, 3186, 1, 1024] - - [876, 9468.22] + - [879, 9468.22] - - [1024, 3976, 1, 1024] - - [880, 9166.98] + - [883, 9166.98] - - [1024, 3511, 1, 4096] - - [882, 8334.96] + - [885, 8334.96] - - [4096, 3573, 1, 1024] - - [875, 9855.23] + - [878, 9855.23] - - [4096, 3561, 1, 1024] - - [875, 9830.93] + - [878, 9830.93] - - [4096, 3418, 1, 1024] - - [876, 9450.58] + - [879, 9450.58] - - [33708, 3906, 1, 1024] - - [878, 9973.57] + - [881, 9973.57] - - [4096, 3259, 1, 1024] - - [875, 9685.16] + - [878, 9685.16] - - [4096, 3308, 1, 1024] - - [877, 9791.93] + - [880, 9791.93] - - [1024, 3419, 1, 4096] - - [897, 8514.43] + - [900, 8514.43] - - [1024, 3215, 1, 4096] - - [896, 8137.43] + - [899, 8137.43] - - [1024, 4030, 1, 4096] - - [878, 9290.66] + - [881, 9290.66] - - [4096, 3459, 1, 1024] - - [875, 9567.47] + - [878, 9567.47] - - [1024, 3572, 1, 4096] - - [895, 8501.33] + - [898, 8501.33] - - [1024, 3137, 1, 4096] - - [897, 7930.05] + - [900, 7930.05] - - [1024, 3312, 1, 4096] - - [898, 8378.5] + - [901, 8378.5] - - [1024, 3925, 1, 4096] - - [880, 9255.76] + - [883, 9255.76] - - [1024, 3453, 1, 4096] - - [897, 8630.66] + - [900, 8630.66] - - [4096, 3435, 1, 1024] - - [876, 9495.08] + - [879, 9495.08] - - [1024, 3176, 1, 4096] - - [897, 8087.13] + - [900, 8087.13] - - [1024, 3444, 1, 4096] - - [889, 8528.48] + - [892, 8528.48] - - [4096, 3975, 1, 1024] - - [878, 9645.24] + - [881, 9645.24] - - [4096, 3182, 1, 1024] - - [877, 9448.3] + - [880, 9448.3] - - [1024, 3475, 1, 4096] - - [896, 8404.77] + - [899, 8404.77] - - [9, 9, 6544, 64] - - [911, 425.754] + - [914, 425.754] - - [33708, 3955, 1, 1024] - - [878, 10088.3] + - [881, 10088.3] - - [4096, 3446, 1, 1024] - - [877, 9519.96] + - [880, 9519.96] - - [1024, 3138, 1, 4096] - - [896, 8053.34] + - [899, 8053.34] - - [1024, 3549, 1, 4096] - - [882, 8426.32] + - [885, 8426.32] - - [4096, 3287, 1, 1024] - - [878, 9751.24] + - [881, 9751.24] - - [1024, 3342, 1, 4096] - - [895, 8319.91] + - [898, 8319.91] - - [102, 102, 624, 64] - - [910, 4747.42] + - [913, 4747.42] - - [4096, 3519, 1, 1024] - - [877, 9716.0] + - [880, 9716.0] - - [4096, 3552, 1, 1024] - - [875, 9806.59] + - [878, 9806.59] - - [4096, 3859, 1, 1024] - - [875, 9369.84] + - [878, 9369.84] - - [33708, 3969, 1, 1024] - - [875, 9830.29] + - [878, 9830.29] - - [1024, 3369, 1, 4096] - - [896, 8379.16] + - [899, 8379.16] - - [4096, 3482, 1, 1024] - - [875, 9631.6] + - [878, 9631.6] - - [1024, 3306, 1, 4096] - - [898, 8319.96] + - [901, 8319.96] - - [1024, 3474, 1, 4096] - - [897, 8498.8] + - [900, 8498.8] - - [99, 99, 624, 64] - - [909, 4492.8] + - [912, 4492.8] - - [4096, 3377, 1, 1024] - - [875, 9369.82] + - [878, 9369.82] - - [4096, 3426, 1, 1024] - - [875, 9467.2] + - [878, 9467.2] - - [4096, 2935, 1, 1024] - - [876, 9423.64] + - [879, 9423.64] - - [4096, 3267, 1, 1024] - - [875, 9697.94] + - [878, 9697.94] - - [1024, 3299, 1, 4096] - - [896, 8264.66] + - [899, 8264.66] - - [1024, 3456, 1, 4096] - - [895, 8678.29] + - [898, 8678.29] - - [1024, 3280, 1, 4096] - - [896, 8220.59] + - [899, 8220.59] - - [1024, 3555, 1, 4096] - - [895, 8656.17] + - [898, 8656.17] - - [4096, 3499, 1, 1024] - - [877, 9663.83] + - [880, 9663.83] - - [4096, 3356, 1, 1024] - - [877, 9296.8] + - [880, 9296.8] - - [100, 102, 624, 64] - - [910, 4671.41] + - [913, 4671.41] - - [1024, 3412, 1, 4096] - - [898, 8537.95] + - [901, 8537.95] - - [1024, 2984, 1, 4096] - - [881, 9193.07] + - [884, 9193.07] - - [4096, 3141, 1, 1024] - - [877, 9349.33] + - [880, 9349.33] - - [4096, 3510, 1, 1024] - - [875, 9701.88] + - [878, 9701.88] - - [1024, 3995, 1, 1024] - - [879, 9243.3] + - [882, 9243.3] - - [1024, 3517, 1, 4096] - - [897, 8569.21] + - [900, 8569.21] - - [1024, 3455, 1, 4096] - - [897, 8560.57] + - [900, 8560.57] - - [1024, 3939, 1, 1024] - - [880, 9030.84] + - [883, 9030.84] - - [38, 38, 1680, 64] - - [909, 2459.74] + - [912, 2459.74] - - [1024, 3447, 1, 4096] - - [895, 8609.92] + - [898, 8609.92] - - [1024, 3969, 1, 4096] - - [882, 9097.23] + - [885, 9097.23] - - [4096, 3527, 1, 1024] - - [877, 9743.73] + - [880, 9743.73] - - [4096, 3336, 1, 1024] - - [877, 9248.23] + - [880, 9248.23] - - [1024, 3191, 1, 4096] - - [895, 8104.86] + - [898, 8104.86] - - [1024, 3302, 1, 4096] - - [896, 8244.99] + - [899, 8244.99] - - [1024, 3337, 1, 4096] - - [898, 8254.15] + - [901, 8254.15] - - [4096, 3290, 1, 1024] - - [877, 9759.03] + - [880, 9759.03] - - [1024, 3512, 1, 4096] - - [886, 8640.96] + - [889, 8640.96] - - [1024, 3433, 1, 4096] - - [896, 8444.6] + - [899, 8444.6] - - [4096, 3876, 1, 1024] - - [876, 9420.28] + - [879, 9420.28] - - [4096, 3490, 1, 1024] - - [877, 9641.01] + - [880, 9641.01] - - [4096, 3064, 1, 1024] - - [877, 9820.39] + - [880, 9820.39] - - [1024, 3508, 1, 4096] - - [892, 8442.14] + - [895, 8442.14] - - [1024, 3956, 1, 4096] - - [877, 9128.09] + - [880, 9128.09] - - [4096, 3417, 1, 1024] - - [877, 9448.31] + - [880, 9448.31] - - [1024, 3248, 1, 4096] - - [896, 8006.06] + - [899, 8006.06] - - [1024, 2499, 1, 4096] - - [896, 8155.09] + - [899, 8155.09] - - [1024, 3186, 1, 4096] - - [896, 8092.94] + - [899, 8092.94] - - [1024, 3180, 1, 4096] - - [898, 8096.92] + - [901, 8096.92] - - [4096, 3364, 1, 1024] - - [877, 9317.98] + - [880, 9317.98] - - [4096, 3976, 1, 1024] - - [877, 9654.37] + - [880, 9654.37] - - [4096, 3205, 1, 1024] - - [878, 9538.74] + - [881, 9538.74] - - [4096, 3318, 1, 1024] - - [875, 9838.19] + - [878, 9838.19] - - [1024, 3377, 1, 4096] - - [898, 8445.54] + - [901, 8445.54] - - [1024, 3485, 1, 4096] - - [895, 8368.73] + - [898, 8368.73] - - [4096, 3181, 1, 1024] - - [878, 9458.19] + - [881, 9458.19] - - [4096, 3550, 1, 1024] - - [875, 9783.04] + - [878, 9783.04] - - [1024, 3534, 1, 4096] - - [884, 8684.89] + - [887, 8684.89] - - [1024, 3860, 1, 1024] - - [879, 8923.08] + - [882, 8923.08] - - [160, 160, 400, 64] - - [922, 5797.59] + - [925, 5797.59] - - [4096, 3445, 1, 1024] - - [877, 9511.18] + - [880, 9511.18] - - [1024, 3391, 1, 4096] - - [898, 8541.67] + - [901, 8541.67] - - [1024, 3221, 1, 4096] - - [896, 8055.4] + - [899, 8055.4] - - [4096, 3079, 1, 1024] - - [875, 9180.94] + - [878, 9180.94] - - [4096, 3144, 1, 1024] - - [877, 9351.35] + - [880, 9351.35] - - [1024, 3270, 1, 4096] - - [897, 8367.53] + - [900, 8367.53] - - [1024, 3561, 1, 4096] - - [897, 8426.19] + - [900, 8426.19] - - [1024, 3480, 1, 4096] - - [884, 8464.9] + - [887, 8464.9] - - [4096, 3408, 1, 1024] - - [877, 9419.94] + - [880, 9419.94] - - [1024, 3418, 1, 4096] - - [898, 8480.92] + - [901, 8480.92] - - [4096, 3298, 1, 1024] - - [878, 9788.3] + - [881, 9788.3] - - [1024, 3640, 1, 1024] - - [881, 8435.34] + - [884, 8435.34] - - [1024, 3449, 1, 4096] - - [896, 8590.77] + - [899, 8590.77] - - [1024, 4020, 1, 4096] - - [874, 9168.03] + - [877, 9168.03] - - [4096, 3481, 1, 1024] - - [875, 9627.81] + - [878, 9627.81] - - [4096, 3530, 1, 1024] - - [877, 9734.58] + - [880, 9734.58] - - [1024, 3216, 1, 4096] - - [898, 8014.22] + - [901, 8014.22] - - [1024, 3840, 1, 1024] - - [881, 8908.27] + - [884, 8908.27] - - [1024, 3491, 1, 4096] - - [884, 8410.49] + - [887, 8410.49] - - [1024, 3154, 1, 4096] - - [897, 8095.59] + - [900, 8095.59] - - [4096, 3425, 1, 1024] - - [877, 9474.43] + - [880, 9474.43] - - [1024, 3348, 1, 4096] - - [895, 8202.8] + - [898, 8202.8] - - [1024, 3415, 1, 4096] - - [896, 8597.58] + - [899, 8597.58] - - [1024, 4026, 1, 1024] - - [879, 9278.99] + - [882, 9278.99] - - [1024, 3367, 1, 4096] - - [898, 8335.44] + - [901, 8335.44] - - [1024, 3259, 1, 4096] - - [898, 8285.2] + - [901, 8285.2] - - [1024, 3894, 1, 4096] - - [881, 9040.34] + - [884, 9040.34] - - [4096, 3355, 1, 1024] - - [876, 9291.57] + - [879, 9291.57] - - [4096, 3404, 1, 1024] - - [877, 9410.37] + - [880, 9410.37] - - [1024, 3308, 1, 4096] - - [898, 8336.2] + - [901, 8336.2] - - [4096, 3245, 1, 1024] - - [876, 9641.37] + - [879, 9641.37] - - [1024, 3502, 1, 4096] - - [897, 8375.8] + - [900, 8375.8] - - [33708, 4032, 1, 1024] - - [876, 9988.1] + - [879, 9988.1] - - [8, 8, 7280, 64] - - [913, 339.778] + - [916, 339.778] - - [1024, 3424, 1, 4096] - - [884, 8489.38] + - [887, 8489.38] - - [4096, 3509, 1, 1024] - - [876, 9702.19] + - [879, 9702.19] - - [4096, 3558, 1, 1024] - - [877, 9815.41] + - [880, 9815.41] - - [1024, 3900, 1, 1024] - - [880, 9013.95] + - [883, 9013.95] - - [1024, 2505, 1, 4096] - - [894, 8263.65] + - [897, 8263.65] - - [4096, 3472, 1, 1024] - - [875, 9609.51] + - [878, 9609.51] - - [1024, 3386, 1, 4096] - - [895, 8417.45] + - [898, 8417.45] - - [4096, 3383, 1, 1024] - - [877, 9364.67] + - [880, 9364.67] - - [4096, 3448, 1, 1024] - - [878, 9520.97] + - [881, 9520.97] - - [4096, 4030, 1, 1024] - - [878, 9771.46] + - [881, 9771.46] - - [4096, 3289, 1, 1024] - - [875, 9757.17] + - [878, 9757.17] - - [1024, 3459, 1, 4096] - - [897, 8422.02] + - [900, 8422.02] - - [1024, 2918, 1, 4096] - - [882, 9022.61] + - [885, 9022.61] - - [4096, 3489, 1, 1024] - - [875, 9641.8] + - [878, 9641.8] - - [4096, 3346, 1, 1024] - - [877, 9271.55] + - [880, 9271.55] - - [4096, 3572, 1, 1024] - - [877, 9829.72] + - [880, 9829.72] - - [1024, 3955, 1, 4096] - - [878, 9221.56] + - [881, 9221.56] - - [4096, 3236, 1, 1024] - - [875, 9620.62] + - [878, 9620.62] - - [4096, 3163, 1, 1024] - - [875, 9397.2] + - [878, 9397.2] - - [4096, 3468, 1, 1024] - - [875, 9601.48] + - [878, 9601.48] - - [1024, 3165, 1, 4096] - - [897, 7941.48] + - [900, 7941.48] - - [1024, 3276, 1, 4096] - - [897, 8244.86] + - [900, 8244.86] - - [1024, 3359, 1, 4096] - - [895, 8273.83] + - [898, 8273.83] - - [4096, 3363, 1, 1024] - - [877, 9315.7] + - [880, 9315.7] - - [1024, 3385, 1, 4096] - - [889, 8286.1] + - [892, 8286.1] - - [1024, 3207, 1, 4096] - - [898, 8143.92] + - [901, 8143.92] - - [1024, 3458, 1, 4096] - - [897, 8472.31] + - [900, 8472.31] - - [21, 21, 2976, 64] - - [913, 2083.2] + - [916, 2083.2] - - [4096, 3110, 1, 1024] - - [875, 9260.2] + - [878, 9260.2] - - [4096, 3925, 1, 1024] - - [878, 9526.56] + - [881, 9526.56] - - [1024, 3975, 1, 4096] - - [873, 9133.74] + - [876, 9133.74] - - [4096, 3549, 1, 1024] - - [877, 9793.67] + - [880, 9793.67] - - [4096, 3342, 1, 1024] - - [876, 9264.38] + - [879, 9264.38] - - [1024, 3859, 1, 1024] - - [879, 8933.37] + - [882, 8933.37] - - [1024, 3497, 1, 4096] - - [896, 8526.03] + - [899, 8526.03] - - [4096, 3280, 1, 1024] - - [877, 9733.22] + - [880, 9733.22] - - [1024, 3435, 1, 4096] - - [896, 8489.75] + - [899, 8489.75] - - [1024, 3354, 1, 4096] - - [896, 8248.73] + - [899, 8248.73] - - [4096, 3191, 1, 1024] - - [876, 9475.02] + - [879, 9475.02] - - [4096, 3512, 1, 1024] - - [875, 9701.27] + - [878, 9701.27] - - [1024, 3055, 1, 4096] - - [882, 9264.81] + - [885, 9264.81] - - [4096, 2499, 1, 1024] - - [877, 9573.96] + - [880, 9573.96] - - [1024, 3233, 1, 4096] - - [895, 8101.64] + - [898, 8101.64] - - [4096, 3423, 1, 1024] - - [878, 9463.4] + - [881, 9463.4] - - [1024, 3319, 1, 4096] - - [898, 8413.66] + - [901, 8413.66] - - [4096, 3297, 1, 1024] - - [875, 9782.56] + - [878, 9782.56] - - [4096, 3154, 1, 1024] - - [877, 9381.1] + - [880, 9381.1] - - [1024, 3540, 1, 4096] - - [898, 8507.43] + - [901, 8507.43] - - [1024, 3289, 1, 4096] - - [898, 8233.7] + - [901, 8233.7] - - [4096, 3529, 1, 1024] - - [877, 9741.05] + - [880, 9741.05] - - [4096, 3386, 1, 1024] - - [877, 9372.47] + - [880, 9372.47] - - [4096, 3276, 1, 1024] - - [875, 9713.66] + - [878, 9713.66] - - [1024, 3244, 1, 4096] - - [898, 8146.73] + - [901, 8146.73] - - [1024, 3182, 1, 4096] - - [895, 8115.02] + - [898, 8115.02] - - [4096, 3540, 1, 1024] - - [875, 9768.32] + - [878, 9768.32] - - [1024, 3360, 1, 4096] - - [897, 8353.21] + - [900, 8353.21] - - [1024, 3942, 1, 4096] - - [876, 9143.68] + - [879, 9143.68] - - [4096, 3403, 1, 1024] - - [878, 9412.08] + - [881, 9412.08] - - [4096, 3101, 1, 1024] - - [878, 9239.18] + - [881, 9239.18] - - [4096, 2918, 1, 1024] - - [877, 9373.65] + - [880, 9373.65] - - [1024, 3465, 1, 4096] - - [898, 8288.06] + - [901, 8288.06] - - [33708, 3780, 1, 1024] - - [877, 9971.81] + - [880, 9971.81] - - [4096, 3557, 1, 1024] - - [875, 9814.72] + - [878, 9814.72] - - [4096, 3414, 1, 1024] - - [875, 9436.53] + - [878, 9436.53] - - [1024, 3948, 1, 1024] - - [879, 9073.7] + - [882, 9073.7] - - [4096, 3320, 1, 1024] - - [877, 9834.67] + - [880, 9834.67] - - [4096, 2765, 1, 1024] - - [877, 9666.96] + - [880, 9666.96] - - [1024, 3978, 1, 4096] - - [872, 9109.5] + - [875, 9109.5] - - [4096, 3487, 1, 1024] - - [875, 9643.9] + - [878, 9643.9] - - [4096, 3520, 1, 1024] - - [877, 9727.98] + - [880, 9727.98] - - [1024, 3139, 1, 4096] - - [897, 7940.09] + - [900, 7940.09] - - [1024, 3314, 1, 4096] - - [895, 8293.91] + - [898, 8293.91] - - [4096, 3431, 1, 1024] - - [877, 9482.02] + - [880, 9482.02] - - [123, 122, 528, 64] - - [910, 6325.88] + - [913, 6325.88] - - [1024, 3446, 1, 4096] - - [891, 8468.24] + - [894, 8468.24] - - [1024, 4059, 1, 4096] - - [878, 9370.7] + - [881, 9370.7] - - [99, 102, 624, 64] - - [910, 4624.7] + - [913, 4624.7] - - [4096, 3345, 1, 1024] - - [875, 9271.22] + - [878, 9271.22] - - [4096, 3394, 1, 1024] - - [875, 9398.09] + - [878, 9398.09] - - [1024, 3927, 1, 1024] - - [880, 9041.28] + - [883, 9041.28] - - [4096, 3235, 1, 1024] - - [875, 9619.83] + - [878, 9619.83] - - [1024, 3328, 1, 4096] - - [896, 8405.99] + - [899, 8405.99] - - [33708, 3956, 1, 1024] - - [876, 10100.3] + - [879, 10100.3] - - [4096, 3467, 1, 1024] - - [877, 9586.56] + - [880, 9586.56] - - [1024, 3287, 1, 4096] - - [897, 8273.73] + - [900, 8273.73] - - [4096, 3214, 1, 1024] - - [878, 9557.39] + - [881, 9557.39] - - [4096, 3910, 1, 1024] - - [875, 9490.15] + - [878, 9490.15] - - [1024, 3780, 1, 1024] - - [882, 8705.9] + - [885, 8705.9] - - [1024, 3371, 1, 4096] - - [898, 8248.36] + - [901, 8248.36] - - [4096, 3478, 1, 1024] - - [878, 9619.52] + - [881, 9619.52] - - [1024, 3546, 1, 4096] - - [896, 8456.73] + - [899, 8456.73] - - [1024, 4012, 1, 1024] - - [879, 9253.24] + - [882, 9253.24] - - [4096, 3341, 1, 1024] - - [877, 9260.14] + - [880, 9260.14] - - [4096, 3454, 1, 1024] - - [875, 9533.52] + - [878, 9533.52] - - [4096, 3295, 1, 1024] - - [878, 9772.76] + - [881, 9772.76] - - [4096, 3072, 1, 1024] - - [875, 9887.13] + - [878, 9887.13] - - [1024, 3282, 1, 4096] - - [883, 8112.75] + - [886, 8112.75] - - [33708, 3720, 1, 1024] - - [878, 9818.75] + - [881, 9818.75] - - [1024, 3681, 1, 4096] - - [880, 8639.18] + - [883, 8639.18] - - [1024, 4050, 1, 4096] - - [878, 9291.83] + - [881, 9291.83] - - [4096, 3495, 1, 1024] - - [877, 9660.42] + - [880, 9660.42] - - [4096, 3560, 1, 1024] - - [876, 9813.7] + - [879, 9813.7] - - [4096, 3751, 1, 1024] - - [875, 9684.85] + - [878, 9684.85] - - [1024, 3414, 1, 4096] - - [896, 8555.62] + - [899, 8555.62] - - [33708, 3860, 1, 1024] - - [875, 9856.58] + - [878, 9856.58] - - [1024, 3325, 1, 4096] - - [885, 8261.11] + - [888, 8261.11] - - [4096, 3458, 1, 1024] - - [875, 9570.76] + - [878, 9570.76] - - [4096, 2967, 1, 1024] - - [875, 9544.51] + - [878, 9544.51] - - [1024, 3519, 1, 4096] - - [898, 8413.0] + - [901, 8413.0] - - [4096, 3385, 1, 1024] - - [877, 9367.24] + - [880, 9367.24] - - [4096, 3434, 1, 1024] - - [875, 9488.31] + - [878, 9488.31] - - [1024, 3552, 1, 4096] - - [896, 8456.03] + - [899, 8456.03] - - [4096, 3822, 1, 1024] - - [876, 9849.74] + - [879, 9849.74] - - [1024, 3544, 1, 4096] - - [895, 8494.46] + - [898, 8494.46] - - [4096, 3539, 1, 1024] - - [877, 9762.99] + - [880, 9762.99] - - [4096, 3332, 1, 1024] - - [875, 9232.26] + - [878, 9232.26] - - [1024, 3145, 1, 4096] - - [895, 8098.26] + - [898, 8098.26] - - [1024, 3535, 1, 4096] - - [883, 8592.7] + - [886, 8592.7] - - [1024, 3320, 1, 4096] - - [896, 8419.45] + - [899, 8419.45] - - [33708, 4012, 1, 1024] - - [878, 9940.1] + - [881, 9940.1] - - [4096, 3286, 1, 1024] - - [877, 9747.72] + - [880, 9747.72] - - [1024, 3514, 1, 4096] - - [896, 8653.59] + - [899, 8653.59] - - [93, 93, 688, 64] - - [917, 5005.69] + - [920, 5005.69] - - [1024, 2765, 1, 4096] - - [882, 8636.62] + - [885, 8636.62] - - [1024, 3452, 1, 4096] - - [895, 8445.77] + - [898, 8445.77] - - [4096, 3518, 1, 1024] - - [875, 9722.46] + - [878, 9722.46] - - [1024, 3529, 1, 4096] - - [895, 8444.22] + - [898, 8444.22] - - [4096, 3413, 1, 1024] - - [875, 9436.25] + - [878, 9436.25] - - [33708, 4050, 1, 1024] - - [877, 10026.6] + - [880, 10026.6] - - [1024, 3525, 1, 4096] - - [888, 8488.89] + - [891, 8488.89] - - [4096, 3303, 1, 1024] - - [875, 9790.95] + - [878, 9790.95] - - [1024, 3382, 1, 4096] - - [896, 8483.53] + - [899, 8483.53] - - [1024, 3390, 1, 4096] - - [895, 8552.71] + - [898, 8552.71] - - [1024, 3977, 1, 4096] - - [877, 9053.43] + - [880, 9053.43] - - [1024, 3184, 1, 4096] - - [895, 8008.71] + - [898, 8008.71] - - [4096, 3535, 1, 1024] - - [877, 9760.69] + - [880, 9760.69] - - [4096, 3376, 1, 1024] - - [878, 9341.83] + - [881, 9341.83] - - [4096, 3978, 1, 1024] - - [878, 9642.7] + - [881, 9642.7] - - [1024, 3136, 1, 4096] - - [897, 8085.02] + - [900, 8085.02] - - [1024, 3293, 1, 4096] - - [895, 8300.39] + - [898, 8300.39] - - [4096, 3266, 1, 1024] - - [876, 9691.68] + - [879, 9691.68] - - [1024, 3487, 1, 4096] - - [895, 8383.52] + - [898, 8383.52] - - [1024, 3409, 1, 4096] - - [897, 8493.15] + - [900, 8493.15] - - [4096, 3498, 1, 1024] - - [876, 9672.28] + - [879, 9672.28] - - [1024, 3520, 1, 4096] - - [898, 8488.16] + - [901, 8488.16] - - [1024, 3530, 1, 4096] - - [879, 8409.77] + - [882, 8409.77] - - [4096, 3393, 1, 1024] - - [877, 9395.33] + - [880, 9395.33] - - [4096, 3140, 1, 1024] - - [877, 9338.4] + - [880, 9338.4] - - [1024, 3536, 1, 4096] - - [898, 8642.01] + - [901, 8642.01] - - [1024, 3288, 1, 4096] - - [898, 8229.24] + - [901, 8229.24] - - [1024, 4005, 1, 4096] - - [880, 9270.94] + - [883, 9270.94] - - [1024, 3579, 1, 4096] - - [884, 8844.4] + - [887, 8844.4] - - [4096, 3372, 1, 1024] - - [875, 9339.15] + - [878, 9339.15] - - [1024, 3440, 1, 4096] - - [895, 8466.59] + - [898, 8466.59] - - [4096, 3213, 1, 1024] - - [878, 9558.75] + - [881, 9558.75] - - [123, 123, 528, 64] - - [910, 6333.49] + - [913, 6333.49] - - [100, 100, 624, 64] - - [909, 4584.02] + - [912, 4584.02] - - [1024, 3968, 1, 4096] - - [876, 9237.5] + - [879, 9237.5] - - [4096, 3477, 1, 1024] - - [876, 9618.78] + - [879, 9618.78] - - [4096, 3526, 1, 1024] - - [875, 9735.84] + - [878, 9735.84] - - [1024, 3493, 1, 4096] - - [896, 8355.03] + - [899, 8355.03] - - [1024, 3944, 1, 4096] - - [871, 9065.29] + - [874, 9065.29] - - [4096, 3453, 1, 1024] - - [876, 9533.27] + - [879, 9533.27] - - [1024, 3350, 1, 4096] - - [898, 8448.54] + - [901, 8448.54] - - [4096, 3184, 1, 1024] - - [877, 9447.28] + - [880, 9447.28] - - [1024, 3423, 1, 4096] - - [896, 8465.28] + - [899, 8465.28] - - [4096, 3351, 1, 1024] - - [875, 9281.96] + - [878, 9281.96] - - [4096, 3416, 1, 1024] - - [875, 9446.54] + - [878, 9446.54] - - [1024, 3796, 1, 4096] - - [877, 8820.24] + - [880, 8820.24] - - [4096, 3257, 1, 1024] - - [875, 9671.54] + - [878, 9671.54] - - [4096, 3306, 1, 1024] - - [877, 9795.41] + - [880, 9795.41] - - [33708, 4020, 1, 1024] - - [877, 9961.75] + - [880, 9961.75] - - [19, 19, 3264, 64] - - [907, 1735.99] + - [910, 1735.99] - - [1024, 3426, 1, 4096] - - [895, 8518.51] + - [898, 8518.51] - - [4096, 3457, 1, 1024] - - [875, 9564.46] + - [878, 9564.46] - - [1024, 2935, 1, 4096] - - [880, 9067.69] + - [883, 9067.69] - - [1024, 3046, 1, 4096] - - [880, 9242.87] + - [883, 9242.87] - - [4096, 3433, 1, 1024] - - [877, 9495.55] + - [880, 9495.55] - - [1024, 3256, 1, 4096] - - [898, 8224.13] + - [901, 8224.13] - - [1024, 3531, 1, 4096] - - [895, 8524.09] + - [898, 8524.09] - - [4096, 3180, 1, 1024] - - [875, 9443.43] + - [878, 9443.43] - - [1024, 3388, 1, 4096] - - [897, 8352.72] + - [900, 8352.72] - - [4096, 3444, 1, 1024] - - [878, 9510.93] + - [881, 9510.93] - - [1024, 3501, 1, 4096] - - [885, 8461.02] + - [888, 8461.02] - - [1024, 3266, 1, 4096] - - [883, 8147.34] + - [886, 8147.34] - - [1024, 3267, 1, 4096] - - [898, 8391.39] + - [901, 8391.39] - - [1024, 3461, 1, 4096] - - [882, 8270.19] + - [885, 8270.19] - - [4096, 3870, 1, 1024] - - [877, 9399.59] + - [880, 9399.59] - - [4096, 3517, 1, 1024] - - [875, 9725.33] + - [878, 9725.33] - - [1024, 3566, 1, 4096] - - [898, 8669.66] + - [901, 8669.66] - - [4096, 3574, 1, 1024] - - [875, 9844.53] + - [878, 9844.53] - - [1024, 3876, 1, 1024] - - [880, 8961.64] + - [883, 8961.64] - - [25, 25, 2512, 64] - - [906, 2472.44] + - [909, 2472.44] - - [4096, 3720, 1, 1024] - - [875, 9612.39] + - [878, 9612.39] - - [4096, 3248, 1, 1024] - - [877, 9644.82] + - [880, 9644.82] - - [4096, 4059, 1, 1024] - - [875, 9826.32] + - [878, 9826.32] - - [1024, 3380, 1, 4096] - - [896, 8677.81] + - [899, 8677.81] - - [4096, 3480, 1, 1024] - - [877, 9626.06] + - [880, 9626.06] - - [1024, 3335, 1, 4096] - - [897, 8302.08] + - [900, 8302.08] - - [1024, 3345, 1, 4096] - - [897, 8323.03] + - [900, 8323.03] - - [4096, 3391, 1, 1024] - - [875, 9379.38] + - [878, 9379.38] - - [4096, 3424, 1, 1024] - - [877, 9466.67] + - [880, 9466.67] - - [1024, 3394, 1, 4096] - - [883, 8373.81] + - [886, 8373.81] - - [4096, 3265, 1, 1024] - - [877, 9700.79] + - [880, 9700.79] - - [1024, 3014, 1, 4096] - - [880, 9302.99] + - [883, 9302.99] - - [4096, 3497, 1, 1024] - - [875, 9668.5] + - [878, 9668.5] - - [4096, 3354, 1, 1024] - - [877, 9294.21] + - [880, 9294.21] - - [4096, 3055, 1, 1024] - - [876, 9780.78] + - [879, 9780.78] - - [1024, 3499, 1, 4096] - - [889, 8526.94] + - [892, 8526.94] - - [1024, 3162, 1, 4096] - - [897, 8058.92] + - [900, 8058.92] - - [4096, 3244, 1, 1024] - - [877, 9636.76] + - [880, 9636.76] - - [1024, 3437, 1, 4096] - - [896, 8583.31] + - [899, 8583.31] - - [1024, 3356, 1, 4096] - - [898, 8296.85] + - [901, 8296.85] - - [4096, 3139, 1, 1024] - - [877, 9338.6] + - [880, 9338.6] - - [4096, 3508, 1, 1024] - - [877, 9700.44] + - [880, 9700.44] - - [1024, 3235, 1, 4096] - - [895, 8314.49] + - [898, 8314.49] - - [1024, 3910, 1, 4096] - - [882, 9200.11] + - [885, 9200.11] - - [4096, 3371, 1, 1024] - - [875, 9336.87] + - [878, 9336.87] - - [1024, 3751, 1, 4096] - - [882, 8827.57] + - [885, 8827.57] - - [4096, 3325, 1, 1024] - - [875, 9845.58] + - [878, 9845.58] - - [1024, 3413, 1, 4096] - - [883, 8345.68] + - [886, 8345.68] - - [1024, 3542, 1, 4096] - - [895, 8521.61] + - [898, 8521.61] - - [18, 18, 3440, 64] - - [911, 1578.14] + - [914, 1578.14] - - [101, 102, 624, 64] - - [909, 4705.18] + - [912, 4705.18] - - [33708, 3900, 1, 1024] - - [875, 9950.95] + - [878, 9950.95] - - [4096, 3525, 1, 1024] - - [876, 9744.37] + - [879, 9744.37] - - [4096, 3382, 1, 1024] - - [876, 9358.93] + - [879, 9358.93] - - [102, 100, 624, 64] - - [910, 4671.41] + - [913, 4671.41] - - [15, 15, 4096, 64] - - [914, 1129.07] + - [917, 1129.07] - - [1024, 3339, 1, 4096] - - [884, 8326.27] + - [887, 8326.27] - - [4096, 3288, 1, 1024] - - [877, 9761.38] + - [880, 9761.38] - - [92, 92, 688, 64] - - [917, 4903.77] + - [920, 4903.77] - - [1024, 3141, 1, 4096] - - [895, 7975.54] + - [898, 7975.54] - - [1024, 3168, 1, 4096] - - [895, 8083.64] + - [898, 8083.64] - - [4096, 3488, 1, 1024] - - [877, 9646.67] + - [880, 9646.67] - - [4096, 3046, 1, 1024] - - [876, 9767.48] + - [879, 9767.48] - - [1024, 3362, 1, 4096] - - [898, 8458.05] + - [901, 8458.05] - - [33708, 3942, 1, 1024] - - [876, 10060.3] + - [879, 10060.3] - - [4096, 3399, 1, 1024] - - [877, 9406.47] + - [880, 9406.47] - - [1024, 3720, 1, 1024] - - [879, 8639.06] + - [882, 8639.06] - - [4096, 3563, 1, 1024] - - [875, 9836.45] + - [878, 9836.45] - - [1024, 3273, 1, 4096] - - [898, 8221.52] + - [901, 8221.52] - - [4096, 3162, 1, 1024] - - [877, 9400.09] + - [880, 9400.09] - - [1024, 3467, 1, 4096] - - [896, 8342.32] + - [899, 8342.32] - - [1024, 3130, 1, 4096] - - [897, 7933.78] + - [900, 7933.78] - - [1024, 3405, 1, 4096] - - [904, 8406.49] + - [907, 8406.49] - - [4096, 3362, 1, 1024] - - [875, 9311.94] + - [878, 9311.94] - - [1024, 3960, 1, 1024] - - [879, 9082.16] + - [882, 9082.16] - - [2048, 128, 1, 4096] - - [929, 5986.52] + - [932, 5986.52] - - [1024, 3712, 1, 36548] - - [927, 9456.15] + - [930, 9456.15] - - [1024, 128, 1, 1024] - - [930, 3631.43] + - [933, 3631.43] - - [3072, 128, 1, 4096] - - [926, 6145.5] + - [929, 6145.5] - - [1024, 3712, 1, 1024] - - [928, 8933.88] + - [931, 8933.88] - - [256, 256, 192, 64] - - [933, 8264.64] + - [936, 8264.64] - - [768, 4096, 1, 768] - - [946, 9642.08] + - [949, 9642.08] - - [768, 64, 1, 768] - - [943, 1850.43] + - [946, 1850.43] - - [768, 1280, 1, 768] - - [946, 8738.13] + - [949, 8738.13] - - [30522, 320, 1, 768] - - [947, 9733.59] + - [950, 9733.59] - - [128, 128, 96, 64] - - [936, 5470.83] + - [939, 5470.83] - - [2, 16, 1, 768] - - [939, 2.47742] + - [942, 2.47742] - - [30522, 1280, 1, 768] - - [945, 10127.9] + - [948, 10127.9] - - [30522, 640, 1, 768] - - [946, 9987.61] + - [949, 9987.61] - - [2, 8, 1, 768] - - [938, 0.96] + - [941, 0.96] - - [768, 4096, 1, 3072] - - [948, 9479.41] + - [951, 9479.41] - - [768, 32, 1, 768] - - [942, 880.334] + - [945, 880.334] - - [2, 64, 1, 768] - - [939, 9.99024] + - [942, 9.99024] - - [256, 256, 96, 64] - - [933, 7614.47] + - [936, 7614.47] - - [64, 64, 768, 64] - - [935, 5354.43] + - [938, 5354.43] - - [30522, 160, 1, 768] - - [944, 7740.11] + - [947, 7740.11] - - [768, 320, 1, 768] - - [937, 5423.67] + - [940, 5423.67] - - [128, 128, 384, 64] - - [934, 7179.98] + - [937, 7179.98] - - [768, 16, 1, 768] - - [940, 706.376] + - [943, 706.376] - - [3072, 4096, 1, 768] - - [949, 9961.74] + - [952, 9961.74] - - [2048, 512, 1, 100] - - [951, 5180.71] + - [954, 5180.71] - - [1024, 200, 1, 560] - - [952, 4061.19] + - [955, 4061.19] - - [256, 1280, 1, 1024] - - [959, 4337.44] + - [962, 4337.44] - - [256, 44505, 1, 1024] - - [995, 8597.69] + - [998, 8597.69] - - [10240, 8976, 1, 256] - - [998, 9471.43] + - [1001, 9471.43] - - [256, 7168, 1, 1024] - - [989, 6718.56] + - [992, 6718.56] - - [8448, 8976, 1, 256] - - [981, 9601.31] + - [984, 9601.31] - - [18944, 8976, 1, 256] - - [990, 9666.26] + - [993, 9666.26] - - [256, 19200, 1, 1024] - - [966, 7488.94] + - [969, 7488.94] - - [5632, 8976, 1, 256] - - [978, 9358.39] + - [981, 9358.39] - - [256, 23552, 1, 1024] - - [993, 7980.89] + - [996, 7980.89] - - [256, 6656, 1, 1024] - - [993, 6287.22] + - [996, 6287.22] - - [256, 14336, 1, 1024] - - [988, 7049.26] + - [991, 7049.26] - - [256, 12544, 1, 1024] - - [966, 6728.47] + - [969, 6728.47] - - [2048, 684, 1, 768] - - [983, 8479.18] + - [986, 8479.18] - - [5376, 8976, 1, 256] - - [978, 9519.51] + - [981, 9519.51] - - [256, 5888, 1, 1024] - - [998, 6012.4] + - [1001, 6012.4] - - [19968, 8976, 1, 256] - - [990, 9684.67] + - [993, 9684.67] - - [3840, 8976, 1, 256] - - [975, 9461.89] + - [978, 9461.89] - - [4608, 8976, 1, 256] - - [975, 9305.82] + - [978, 9305.82] - - [256, 684, 1, 1024] - - [1001, 3513.06] + - [1004, 3513.06] - - [256, 22016, 1, 1024] - - [966, 7643.79] + - [969, 7643.79] - - [256, 23296, 1, 1024] - - [995, 8048.12] + - [998, 8048.12] - - [4864, 8976, 1, 256] - - [973, 9545.62] + - [976, 9545.62] - - [256, 7424, 1, 1024] - - [991, 6770.65] + - [994, 6770.65] - - [18176, 8976, 1, 256] - - [998, 9729.47] + - [1001, 9729.47] - - [256, 15104, 1, 1024] - - [987, 7289.08] + - [990, 7289.08] - - [8192, 8976, 1, 256] - - [990, 9395.49] + - [993, 9395.49] - - [256, 16128, 1, 1024] - - [990, 7461.28] + - [993, 7461.28] - - [13312, 8976, 1, 256] - - [998, 9550.97] + - [1001, 9550.97] - - [256, 21504, 1, 1024] - - [995, 7635.93] + - [998, 7635.93] - - [6400, 8976, 1, 256] - - [982, 9560.96] + - [985, 9560.96] - - [256, 8960, 1, 1024] - - [957, 6292.36] + - [960, 6292.36] - - [1792, 8976, 1, 256] - - [972, 9372.18] + - [975, 9372.18] - - [13824, 8976, 1, 256] - - [990, 9585.27] + - [993, 9585.27] - - [11776, 8976, 1, 256] - - [990, 9560.34] + - [993, 9560.34] - - [256, 20992, 1, 1024] - - [988, 7490.65] + - [991, 7490.65] - - [20480, 8976, 1, 256] - - [998, 9610.7] + - [1001, 9610.7] - - [5888, 8976, 1, 256] - - [969, 9565.2] + - [972, 9565.2] - - [256, 10496, 1, 1024] - - [960, 6631.96] + - [963, 6631.96] - - [21248, 8976, 1, 256] - - [990, 9755.77] + - [993, 9755.77] - - [5120, 8976, 1, 256] - - [998, 9244.59] + - [1001, 9244.59] - - [7168, 8976, 1, 256] - - [990, 9388.42] + - [993, 9388.42] - - [2048, 1536, 1, 768] - - [979, 9446.04] + - [982, 9446.04] - - [256, 8192, 1, 1024] - - [984, 6948.89] + - [987, 6948.89] - - [4096, 8976, 1, 256] - - [989, 9115.94] + - [992, 9115.94] - - [3328, 8976, 1, 256] - - [982, 9434.55] + - [985, 9434.55] - - [1280, 8976, 1, 256] - - [980, 9129.8] + - [983, 9129.8] - - [2560, 8976, 1, 256] - - [977, 9199.48] + - [980, 9199.48] - - [3072, 8976, 1, 256] - - [992, 8963.6] + - [995, 8963.6] - - [256, 11776, 1, 1024] - - [970, 6869.8] + - [973, 6869.8] - - [18688, 8976, 1, 256] - - [998, 9726.21] + - [1001, 9726.21] - - [15104, 8976, 1, 256] - - [998, 9715.71] + - [1001, 9715.71] - - [23552, 8976, 1, 256] - - [990, 9648.42] + - [993, 9648.42] - - [6144, 8976, 1, 256] - - [998, 9339.8] + - [1001, 9339.8] - - [12544, 8976, 1, 256] - - [998, 9654.45] + - [1001, 9654.45] - - [256, 11264, 1, 1024] - - [971, 6814.98] + - [974, 6814.98] - - [2048, 114, 1, 512] - - [1002, 4583.5] + - [1005, 4583.5] - - [4352, 8976, 1, 256] - - [982, 9471.4] + - [985, 9471.4] - - [15360, 8976, 1, 256] - - [998, 9583.77] + - [1001, 9583.77] - - [256, 31488, 1, 1024] - - [997, 8438.01] + - [1000, 8438.01] - - [28672, 8976, 1, 256] - - [990, 9688.85] + - [993, 9688.85] - - [256, 18176, 1, 1024] - - [966, 7405.09] + - [969, 7405.09] - - [9728, 8976, 1, 256] - - [998, 9524.15] + - [1001, 9524.15] - - [256, 2816, 1, 1024] - - [962, 5405.66] + - [965, 5405.66] - - [256, 18944, 1, 1024] - - [966, 7503.41] + - [969, 7503.41] - - [256, 3584, 1, 1024] - - [965, 6107.15] + - [968, 6107.15] - - [7936, 8976, 1, 256] - - [978, 9608.31] + - [981, 9608.31] - - [19712, 8976, 1, 256] - - [998, 9736.25] + - [1001, 9736.25] - - [256, 14848, 1, 1024] - - [971, 7163.42] + - [974, 7163.42] - - [256, 8448, 1, 1024] - - [971, 6372.56] + - [974, 6372.56] - - [256, 6400, 1, 1024] - - [985, 6395.71] + - [988, 6395.71] - - [256, 6144, 1, 1024] - - [996, 6490.22] + - [999, 6490.22] - - [9472, 8976, 1, 256] - - [975, 9609.92] + - [978, 9609.92] - - [256, 9984, 1, 1024] - - [958, 6484.75] + - [961, 6484.75] - - [684, 8976, 1, 256] - - [967, 8128.53] + - [970, 8128.53] - - [20992, 8976, 1, 256] - - [990, 9689.65] + - [993, 9689.65] - - [2048, 684, 1, 512] - - [974, 7241.78] + - [977, 7241.78] - - [2048, 114, 1, 768] - - [1000, 4872.46] + - [1003, 4872.46] - - [8960, 8976, 1, 256] - - [973, 9603.35] + - [976, 9603.35] - - [2048, 1536, 1, 512] - - [976, 8830.11] + - [979, 8830.11] - - [256, 3328, 1, 1024] - - [964, 5612.55] + - [967, 5612.55] - - [33536, 8976, 1, 256] - - [990, 9797.71] + - [993, 9797.71] - - [2048, 8976, 1, 256] - - [990, 8975.46] + - [993, 8975.46] - - [10496, 8976, 1, 256] - - [981, 9654.43] + - [984, 9654.43] - - [256, 5376, 1, 1024] - - [999, 5626.34] + - [1002, 5626.34] - - [256, 21248, 1, 1024] - - [968, 7525.45] + - [971, 7525.45] - - [256, 13312, 1, 1024] - - [966, 6767.11] + - [969, 6767.11] - - [16128, 8976, 1, 256] - - [990, 9715.57] + - [993, 9715.57] - - [2304, 8976, 1, 256] - - [963, 9433.83] + - [966, 9433.83] - - [256, 4864, 1, 1024] - - [953, 5743.55] + - [956, 5743.55] - - [17152, 8976, 1, 256] - - [998, 9708.94] + - [1001, 9708.94] - - [15872, 8976, 1, 256] - - [998, 9657.57] + - [1001, 9657.57] - - [9984, 8976, 1, 256] - - [975, 9639.74] + - [978, 9639.74] - - [256, 14592, 1, 1024] - - [987, 7223.92] + - [990, 7223.92] - - [256, 33536, 1, 1024] - - [994, 8147.31] + - [997, 8147.31] - - [11264, 8976, 1, 256] - - [990, 9509.96] + - [993, 9509.96] - - [31488, 8976, 1, 256] - - [998, 9799.31] + - [1001, 9799.31] - - [256, 20480, 1, 1024] - - [971, 7498.2] + - [974, 7498.2] - - [44505, 8976, 1, 256] - - [982, 9804.78] + - [985, 9804.78] - - [13568, 8976, 1, 256] - - [990, 9680.24] + - [993, 9680.24] - - [256, 11520, 1, 1024] - - [970, 6805.26] + - [973, 6805.26] - - [256, 7936, 1, 1024] - - [986, 6971.77] + - [989, 6971.77] - - [2048, 256, 1, 768] - - [956, 7129.13] + - [959, 7129.13] - - [256, 4608, 1, 1024] - - [954, 5462.91] + - [957, 5462.91] - - [256, 2304, 1, 1024] - - [961, 4842.69] + - [964, 4842.69] - - [256, 2560, 1, 1024] - - [962, 5309.25] + - [965, 5309.25] - - [2816, 8976, 1, 256] - - [973, 9409.56] + - [976, 9409.56] - - [1728, 320, 1, 64] - - [1009, 3205.57] + - [1012, 3205.57] - - [1152, 128, 1, 784] - - [1056, 3498.96] + - [1059, 3498.96] - - [576, 96, 1, 5329] - - [1042, 3947.92] + - [1045, 3947.92] - - [864, 96, 1, 1225] - - [1063, 3009.67] + - [1066, 3009.67] - - [256, 128, 1, 784] - - [1053, 1536.49] + - [1056, 1536.49] - - [1440, 320, 1, 196] - - [1006, 4824.62] + - [1009, 4824.62] - - [192, 48, 1, 1225] - - [1084, 820.465] + - [1087, 820.465] - - [2592, 384, 1, 289] - - [1024, 7353.01] + - [1027, 7353.01] - - [192, 80, 36, 10368] - - [1074, 5360.04] + - [1077, 5360.04] - - [896, 192, 1, 289] - - [1041, 3076.56] + - [1044, 3076.56] - - [768, 128, 1, 289] - - [1066, 2351.81] + - [1069, 2351.81] - - [64, 256, 1, 3136] - - [1092, 1809.16] + - [1095, 1809.16] - - [1280, 384, 1, 64] - - [1006, 3171.1] + - [1009, 3171.1] - - [512, 144, 1, 196] - - [1064, 1445.07] + - [1067, 1445.07] - - [1344, 192, 1, 289] - - [1047, 4376.52] + - [1050, 4376.52] - - [288, 64, 1, 21609] - - [1058, 3396.12] + - [1061, 3396.12] - - [400, 32, 1, 784] - - [1085, 922.353] + - [1088, 922.353] - - [288, 32, 1, 21609] - - [1096, 2816.01] + - [1099, 2816.01] - - [1280, 448, 1, 64] - - [1009, 3253.56] + - [1012, 3253.56] - - [3456, 256, 1, 169] - - [1021, 5822.44] + - [1024, 5822.44] - - [2304, 256, 1, 196] - - [1019, 4931.98] + - [1022, 4931.98] - - [384, 192, 1, 1225] - - [1067, 2720.39] + - [1070, 2720.39] - - [832, 48, 1, 49] - - [1062, 344.518] + - [1065, 344.518] - - [832, 192, 1, 49] - - [1044, 1099.36] + - [1047, 1099.36] - - [1280, 192, 1, 64] - - [1045, 2069.56] + - [1048, 2069.56] - - [192, 32, 1, 784] - - [1084, 459.627] + - [1087, 459.627] - - [288, 48, 1, 1225] - - [1091, 1176.0] + - [1094, 1176.0] - - [512, 112, 1, 196] - - [1059, 1277.21] + - [1062, 1277.21] - - [224, 192, 36, 2592] - - [1076, 7369.56] + - [1079, 7369.56] - - [528, 32, 1, 196] - - [1050, 440.374] + - [1053, 440.374] - - [192, 128, 36, 1568] - - [1075, 8245.76] + - [1078, 8245.76] - - [4032, 384, 1, 64] - - [1020, 5898.24] + - [1023, 5898.24] - - [576, 64, 1, 3136] - - [1065, 2671.11] + - [1068, 2671.11] - - [2048, 32, 1, 1001] - - [1067, 2323.0] + - [1070, 2323.0] - - [480, 64, 1, 196] - - [1052, 752.64] + - [1055, 752.64] - - [512, 256, 1, 196] - - [1054, 2528.55] + - [1057, 2528.55] - - [864, 96, 1, 289] - - [1064, 1958.4] + - [1067, 1958.4] - - [896, 128, 1, 289] - - [1067, 2725.73] + - [1070, 2725.73] - - [192, 64, 1, 784] - - [1082, 898.675] + - [1085, 898.675] - - [1200, 64, 1, 1225] - - [1066, 2780.14] + - [1069, 2780.14] - - [1296, 288, 1, 196] - - [1005, 3826.18] + - [1008, 3826.18] - - [576, 96, 1, 5041] - - [1046, 3795.58] + - [1049, 3795.58] - - [1024, 256, 1, 289] - - [1035, 4488.13] + - [1038, 4488.13] - - [1024, 2048, 1, 49] - - [1025, 5077.1] + - [1028, 5077.1] - - [192, 64, 36, 6272] - - [1069, 7514.98] + - [1072, 7514.98] - - [4096, 512, 1, 4096] - - [1031, 10276.0] + - [1034, 10276.0] - - [192, 32, 1, 1225] - - [1085, 556.686] + - [1088, 556.686] - - [1024, 256, 1, 196] - - [1045, 3892.44] + - [1048, 3892.44] - - [1120, 192, 1, 289] - - [1034, 3752.81] + - [1037, 3752.81] - - [400, 48, 1, 196] - - [1059, 480.0] + - [1062, 480.0] - - [1728, 224, 1, 1225] - - [1012, 5575.77] + - [1015, 5575.77] - - [800, 96, 1, 784] - - [1066, 2668.94] + - [1069, 2668.94] - - [1152, 384, 1, 64] - - [1016, 3077.34] + - [1019, 3077.34] - - [4608, 512, 1, 49] - - [1023, 4676.6] + - [1026, 4676.6] - - [1792, 256, 1, 289] - - [1016, 5345.94] + - [1019, 5345.94] - - [864, 128, 1, 784] - - [1066, 3816.2] + - [1069, 3816.2] - - [1728, 384, 1, 169] - - [1018, 5191.68] + - [1021, 5191.68] - - [480, 16, 1, 196] - - [1087, 241.231] + - [1090, 241.231] - - [1568, 256, 1, 289] - - [1006, 4723.41] + - [1009, 4723.41] - - [1152, 448, 1, 64] - - [1012, 3356.72] + - [1015, 3356.72] - - [512, 64, 1, 196] - - [1051, 802.816] + - [1054, 802.816] - - [1344, 224, 1, 289] - - [1006, 3519.63] + - [1009, 3519.63] - - [9216, 512, 1, 4096] - - [1029, 9146.02] + - [1032, 9146.02] - - [27, 32, 1, 22201] - - [1097, 264.356] + - [1100, 264.356] - - [1152, 192, 1, 784] - - [1036, 4904.08] + - [1039, 4904.08] - - [1536, 256, 1, 64] - - [1004, 2578.47] + - [1007, 2578.47] - - [800, 128, 1, 196] - - [1066, 1991.11] + - [1069, 1991.11] - - [800, 64, 1, 196] - - [1061, 1150.83] + - [1064, 1150.83] - - [864, 208, 1, 196] - - [1038, 2684.72] + - [1041, 2684.72] - - [1440, 320, 1, 49] - - [1007, 2313.44] + - [1010, 2313.44] - - [512, 128, 1, 784] - - [1057, 2780.32] + - [1060, 2780.32] - - [720, 192, 1, 5041] - - [1032, 5410.46] + - [1035, 5410.46] - - [256, 64, 1, 784] - - [1089, 1163.5] + - [1092, 1163.5] - - [256, 48, 1, 1225] - - [1084, 1075.2] + - [1087, 1075.2] - - [576, 192, 1, 3136] - - [1032, 4833.01] + - [1035, 4833.01] - - [160, 64, 1, 5329] - - [1086, 1753.5] + - [1089, 1753.5] - - [3456, 384, 1, 289] - - [1026, 7341.75] + - [1029, 7341.75] - - [32, 32, 36, 43808] - - [1080, 1378.03] + - [1083, 1378.03] - - [1344, 512, 1, 64] - - [1005, 3822.93] + - [1008, 3822.93] - - [192, 16, 1, 784] - - [1085, 228.073] + - [1088, 228.073] - - [3456, 384, 1, 169] - - [1022, 6675.02] + - [1025, 6675.02] - - [1152, 256, 1, 196] - - [1015, 3211.26] + - [1018, 3211.26] - - [1728, 192, 1, 1225] - - [1016, 4852.26] + - [1019, 4852.26] - - [2048, 512, 1, 49] - - [1028, 3471.64] + - [1031, 3471.64] - - [576, 96, 1, 1225] - - [1059, 2176.66] + - [1062, 2176.66] - - [512, 2048, 1, 49] - - [1010, 3845.83] + - [1013, 3845.83] - - [1728, 192, 1, 64] - - [1005, 2369.83] + - [1008, 2369.83] - - [832, 256, 1, 49] - - [1035, 1433.6] + - [1038, 1433.6] - - [512, 128, 1, 196] - - [1060, 1459.67] + - [1063, 1459.67] - - [1200, 128, 1, 49] - - [1055, 1069.09] + - [1058, 1069.09] - - [528, 256, 1, 196] - - [1043, 2069.76] + - [1046, 2069.76] - - [256, 512, 1, 784] - - [1066, 4538.89] + - [1069, 4538.89] - - [480, 192, 1, 196] - - [1066, 1792.0] + - [1069, 1792.0] - - [96, 64, 36, 2592] - - [1073, 4845.41] + - [1076, 4845.41] - - [96, 96, 36, 2592] - - [1078, 5111.53] + - [1081, 5111.53] - - [1024, 192, 1, 289] - - [1040, 3431.14] + - [1043, 3431.14] - - [1536, 384, 1, 64] - - [1011, 3166.84] + - [1014, 3166.84] - - [192, 96, 1, 784] - - [1051, 881.14] + - [1054, 881.14] - - [2048, 192, 1, 64] - - [1008, 2330.17] + - [1011, 2330.17] - - [192, 64, 1, 1225] - - [1090, 1100.35] + - [1093, 1100.35] - - [512, 32, 1, 196] - - [1081, 477.867] + - [1084, 477.867] - - [128, 96, 36, 1568] - - [1077, 6649.09] + - [1080, 6649.09] - - [528, 128, 1, 196] - - [1063, 1403.23] + - [1066, 1403.23] - - [128, 512, 1, 784] - - [1053, 2237.81] + - [1056, 2237.81] - - [128, 128, 36, 3136] - - [1070, 6538.77] + - [1073, 6538.77] - - [528, 160, 1, 196] - - [1067, 1642.67] + - [1070, 1642.67] - - [448, 64, 1, 5329] - - [1042, 3264.81] + - [1045, 3264.81] - - [1280, 320, 1, 64] - - [1006, 2776.95] + - [1009, 2776.95] - - [1792, 320, 1, 289] - - [1018, 5204.9] + - [1021, 5204.9] - - [2880, 320, 1, 64] - - [1014, 4336.94] + - [1017, 4336.94] - - [147, 64, 1, 12544] - - [1095, 2430.27] + - [1098, 2430.27] - - [4096, 512, 1, 1001] - - [1030, 9618.99] + - [1033, 9618.99] - - [1536, 32, 1, 1001] - - [1067, 1757.18] + - [1070, 1757.18] - - [512, 160, 1, 196] - - [1063, 1592.89] + - [1066, 1592.89] - - [768, 160, 1, 289] - - [1064, 2757.17] + - [1067, 2757.17] - - [1728, 384, 1, 49] - - [1016, 3102.49] + - [1019, 3102.49] - - [64, 32, 36, 43808] - - [1071, 2626.43] + - [1074, 2626.43] - - [64, 64, 1, 3136] - - [1083, 610.506] + - [1086, 610.506] - - [256, 32, 1, 784] - - [1084, 612.837] + - [1087, 612.837] - - [480, 96, 1, 196] - - [1059, 1055.1] + - [1062, 1055.1] - - [1024, 32, 1, 1001] - - [1049, 1188.43] + - [1052, 1188.43] - - [832, 160, 1, 49] - - [1064, 959.247] + - [1067, 959.247] - - [512, 1024, 1, 196] - - [1007, 4978.7] + - [1010, 4978.7] - - [2048, 64, 1, 1001] - - [1099, 4385.13] + - [1102, 4385.13] - - [2048, 128, 1, 1001] - - [1098, 5764.63] + - [1101, 5764.63] - - [1536, 64, 1, 1001] - - [1100, 3162.03] + - [1103, 3162.03] - - [32, 32, 64, 40000] - - [1134, 2449.4] + - [1137, 2449.4] - - [224, 192, 36, 5184] - - [1129, 7500.12] + - [1132, 7500.12] - - [32, 32, 49, 115200] - - [1135, 1878.28] + - [1138, 1878.28] - - [384, 448, 49, 512] - - [1125, 8945.32] + - [1128, 8945.32] - - [192, 80, 36, 20736] - - [1123, 5412.26] + - [1126, 5412.26] - - [384, 448, 64, 256] - - [1126, 9230.33] + - [1129, 9230.33] - - [96, 64, 64, 18432] - - [1110, 5008.4] + - [1113, 5008.4] - - [224, 192, 64, 4608] - - [1129, 8684.53] + - [1132, 8684.53] - - [96, 96, 49, 3136] - - [1133, 5183.63] + - [1136, 5183.63] - - [224, 192, 64, 2304] - - [1125, 8722.76] + - [1128, 8722.76] - - [64, 32, 49, 57600] - - [1115, 3565.26] + - [1118, 3565.26] - - [384, 448, 36, 256] - - [1124, 8843.41] + - [1127, 8843.41] - - [96, 64, 36, 10368] - - [1117, 4997.46] + - [1120, 4997.46] - - [96, 64, 36, 20736] - - [1119, 5034.77] + - [1122, 5034.77] - - [192, 80, 49, 14400] - - [1115, 4892.22] + - [1118, 4892.22] - - [96, 64, 49, 6272] - - [1136, 5617.04] + - [1139, 5617.04] - - [64, 32, 49, 115200] - - [1114, 3572.57] + - [1117, 3572.57] - - [384, 448, 49, 256] - - [1127, 8858.66] + - [1130, 8858.66] - - [96, 96, 64, 2304] - - [1123, 5379.02] + - [1126, 5379.02] - - [96, 96, 49, 6272] - - [1132, 5235.76] + - [1135, 5235.76] - - [224, 192, 49, 6272] - - [1128, 7629.28] + - [1131, 7629.28] - - [96, 96, 36, 10368] - - [1131, 5281.04] + - [1134, 5281.04] - - [96, 64, 36, 5184] - - [1116, 4945.73] + - [1119, 4945.73] - - [384, 448, 64, 512] - - [1124, 9294.86] + - [1127, 9294.86] - - [224, 192, 49, 3136] - - [1128, 7513.4] + - [1131, 7513.4] - - [384, 448, 36, 512] - - [1130, 8961.38] + - [1133, 8961.38] - - [32, 32, 36, 175232] - - [1138, 1385.5] + - [1141, 1385.5] - - [224, 192, 36, 10368] - - [1129, 7565.73] + - [1132, 7565.73] - - [64, 32, 64, 40000] - - [1114, 4658.85] + - [1117, 4658.85] - - [96, 64, 64, 4608] - - [1113, 5461.6] + - [1116, 5461.6] - - [32, 32, 49, 57600] - - [1135, 1877.01] + - [1138, 1877.01] - - [192, 80, 36, 41472] - - [1121, 5123.59] + - [1124, 5123.59] - - [32, 32, 36, 87616] - - [1134, 1382.32] + - [1137, 1382.32] - - [192, 80, 49, 28800] - - [1114, 4901.95] + - [1117, 4901.95] - - [96, 64, 49, 28800] - - [1111, 4862.5] + - [1114, 4862.5] - - [96, 64, 36, 41472] - - [1118, 5002.26] + - [1121, 5002.26] - - [192, 80, 64, 9216] - - [1109, 5300.55] + - [1112, 5300.55] - - [96, 96, 36, 5184] - - [1131, 5246.24] + - [1134, 5246.24] - - [32, 32, 64, 80000] - - [1139, 2457.11] + - [1142, 2457.11] - - [96, 64, 64, 2304] - - [1137, 6225.74] + - [1140, 6225.74] - - [96, 64, 49, 3136] - - [1136, 5489.02] + - [1139, 5489.02] - - [64, 32, 36, 87616] - - [1114, 2636.29] + - [1117, 2636.29] - - [64, 32, 64, 80000] - - [1114, 4677.64] + - [1117, 4677.64] - - [96, 96, 64, 4608] - - [1120, 5119.63] + - [1123, 5119.63] - - [64, 32, 36, 175232] - - [1115, 2639.83] + - [1118, 2639.83] - - [64, 64, 11, 233600] - - [1170, 1694.16] + - [1173, 1694.16] - - [320, 256, 9, 19584] - - [1146, 7802.45] + - [1149, 7802.45] - - [256, 224, 9, 9792] - - [1161, 7100.97] + - [1164, 7100.97] + - - [128, 128, 11, 3264] + - [1170, 4828.06] - - [256, 256, 9, 4896] - - [1159, 6163.1] + - [1162, 6163.1] - - [320, 256, 9, 4896] - - [1145, 7515.25] + - [1148, 7515.25] - - [224, 192, 9, 19584] - - [1153, 5761.25] + - [1156, 5761.25] - - [192, 192, 11, 3264] - - [1142, 6814.07] + - [1145, 6814.07] - - [64, 64, 11, 116800] - - [1179, 1692.18] + - [1182, 1692.18] - - [64, 64, 9, 172864] - - [1171, 1385.54] + - [1174, 1385.54] - - [192, 128, 11, 6528] - - [1163, 5057.19] + - [1166, 5057.19] - - [64, 64, 11, 58400] - - [1179, 1688.53] + - [1182, 1688.53] - - [192, 160, 9, 19584] - - [1155, 4940.68] + - [1158, 4940.68] - - [128, 128, 9, 9792] - - [1177, 4094.51] + - [1180, 4094.51] + - - [128, 128, 11, 6528] + - [1180, 4780.97] - - [192, 192, 11, 6528] - - [1142, 6918.07] + - [1145, 6918.07] - - [160, 160, 9, 4896] - - [1168, 4545.61] + - [1171, 4545.61] - - [192, 192, 9, 4896] - - [1158, 6156.67] + - [1161, 6156.67] - - [256, 256, 11, 13056] - - [1148, 7526.25] + - [1151, 7526.25] - - [224, 192, 11, 6528] - - [1172, 7333.58] + - [1175, 7333.58] + - - [192, 192, 9, 19584] + - [1176, 5859.95] - - [256, 224, 11, 13056] - - [1146, 6512.15] + - [1149, 6512.15] - - [224, 192, 11, 13056] - - [1175, 6429.18] + - [1178, 6429.18] - - [256, 256, 11, 3264] - - [1143, 7366.03] + - [1146, 7366.03] - - [192, 160, 11, 13056] - - [1155, 5994.41] + - [1158, 5994.41] - - [320, 256, 11, 6528] - - [1152, 8725.7] + - [1155, 8725.7] - - [192, 192, 9, 9792] - - [1155, 5843.92] + - [1158, 5843.92] - - [192, 160, 11, 6528] - - [1164, 6308.46] + - [1167, 6308.46] - - [224, 224, 9, 9792] - - [1165, 6268.31] + - [1168, 6268.31] - - [64, 64, 9, 86432] - - [1170, 1382.91] + - [1173, 1382.91] - - [224, 192, 11, 3264] - - [1173, 7336.37] + - [1176, 7336.37] + - - [128, 128, 9, 19584] + - [1143, 3631.15] - - [224, 224, 11, 6528] - - [1162, 5718.39] + - [1165, 5718.39] - - [160, 160, 11, 13056] - - [1174, 5005.14] + - [1177, 5005.14] - - [160, 160, 9, 19584] - - [1169, 4564.74] + - [1172, 4564.74] - - [192, 128, 9, 19584] - - [1140, 5444.53] + - [1143, 5444.53] - - [192, 160, 9, 9792] - - [1157, 5209.54] + - [1160, 5209.54] - - [224, 224, 9, 19584] - - [1165, 5549.59] + - [1168, 5549.59] - - [192, 192, 11, 13056] - - [1147, 7053.75] + - [1150, 7053.75] - - [192, 128, 9, 4896] - - [1147, 5314.67] + - [1150, 5314.67] - - [320, 256, 9, 9792] - - [1141, 7770.2] + - [1144, 7770.2] - - [320, 256, 11, 13056] - - [1151, 8806.16] + - [1154, 8806.16] + - - [64, 64, 9, 345728] + - [1182, 1386.57] - - [128, 128, 9, 4896] - - [1177, 4041.34] + - [1180, 4041.34] - - [256, 256, 9, 9792] - - [1162, 6138.47] + - [1165, 6138.47] - - [224, 224, 9, 4896] - - [1154, 6936.98] + - [1157, 6936.98] - - [320, 256, 11, 3264] - - [1150, 8630.45] + - [1153, 8630.45] + - - [256, 256, 11, 6528] + - [1145, 7354.98] - - [224, 192, 9, 4896] - - [1174, 6747.03] + - [1177, 6747.03] - - [256, 224, 9, 19584] - - [1149, 5923.69] + - [1152, 5923.69] - - [192, 128, 11, 3264] - - [1160, 4952.72] + - [1163, 4952.72] - - [224, 224, 11, 13056] - - [1162, 5747.58] + - [1165, 5747.58] - - [224, 224, 11, 3264] - - [1165, 5738.78] + - [1168, 5738.78] - - [160, 160, 11, 3264] - - [1163, 5133.73] + - [1166, 5133.73] - - [256, 224, 11, 6528] - - [1156, 6509.68] + - [1159, 6509.68] + - - [128, 128, 11, 13056] + - [1150, 4411.67] - - [192, 160, 9, 4896] - - [1176, 5118.14] + - [1179, 5118.14] - - [256, 224, 11, 3264] - - [1166, 6508.85] + - [1169, 6508.85] - - [160, 160, 9, 9792] - - [1178, 4552.22] + - [1181, 4552.22] - - [192, 160, 11, 3264] - - [1157, 6185.35] + - [1160, 6185.35] - - [256, 256, 9, 19584] - - [1146, 6147.61] + - [1149, 6147.61] + - - [192, 128, 11, 13056] + - [1160, 5112.27] - - [224, 192, 9, 9792] - - [1144, 6657.91] + - [1147, 6657.91] - - [160, 160, 11, 6528] - - [1163, 5254.64] + - [1166, 5254.64] - - [256, 224, 9, 4896] - - [1156, 7023.59] + - [1159, 7023.59] - - [192, 128, 9, 9792] - - [1147, 5400.54] + - [1150, 5400.54] + - - [1024, 6400, 1, 65] + - [1183, 5298.31] + - - [4096, 6400, 1, 256] + - [1184, 9150.88] + - - [4096, 64, 1, 1024] + - [1185, 5482.75] - null diff --git a/scripts/performance/gemm_new_sizes_strided_TN.sh b/scripts/performance/gemm_new_sizes_strided.sh similarity index 100% rename from scripts/performance/gemm_new_sizes_strided_TN.sh rename to scripts/performance/gemm_new_sizes_strided.sh diff --git a/scripts/performance/shakespeare.ssh b/scripts/performance/shakespeare.ssh new file mode 100755 index 000000000..b9b1463c5 --- /dev/null +++ b/scripts/performance/shakespeare.ssh @@ -0,0 +1,11 @@ +#!/bin/bash + +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1024 -n 64 -k 4096 --alpha 1.0 --a_type f32_r --lda 1024 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 1024 -n 64 -k 4096 --alpha 1.0 --a_type f32_r --lda 1024 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 6144 --d_type f32_r --ldd 6144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB N -m 256 -n 6400 -k 4096 --alpha 1.0 --a_type f32_r --lda 256 --b_type f32_r --ldb 6144 --beta 0.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1024 -n 4096 -k 6336 --alpha 1.0 --a_type f32_r --lda 6144 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 1024 -n 4096 -k 64 --alpha 1.0 --a_type f32_r --lda 1024 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA N --transposeB T -m 256 -n 4096 -k 6400 --alpha 1.0 --a_type f32_r --lda 256 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 256 --d_type f32_r --ldd 256 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4096 -n 64 -k 1024 --alpha 1.0 --a_type f32_r --lda 1024 --b_type f32_r --ldb 1024 --beta 1.0 --c_type f32_r --ldc 6144 --d_type f32_r --ldd 6144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4096 -n 64 -k 1024 --alpha 1.0 --a_type f32_r --lda 1024 --b_type f32_r --ldb 6144 --beta 1.0 --c_type f32_r --ldc 6144 --d_type f32_r --ldd 6144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0 +./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 4096 -n 6400 -k 256 --alpha 1.0 --a_type f32_r --lda 256 --b_type f32_r --ldb 256 --beta 1.0 --c_type f32_r --ldc 6144 --d_type f32_r --ldd 6144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0