From a7c2c6d669150c405d353f1c2845a9f3cb36ec44 Mon Sep 17 00:00:00 2001 From: Wen-Chuan Chen Date: Mon, 23 Jun 2025 04:34:29 -0400 Subject: [PATCH 1/5] rm outdated HHS auxh kernels in gridbased gfx1201 --- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1911 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ...jlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 7014 ------------- ...ljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 7273 ------------- ...jlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 8050 --------------- ...ljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 9086 ----------------- 8 files changed, 8550 insertions(+), 31423 deletions(-) create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..5ea3910e819d --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA_gVzqyXY4lVMFx31MqSbB0ud3S5R9EQWi12BfYDWDLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAIVneIeXsqZD4OaZjJp8hvawKFBg-ime18ufUnc8u8cc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAKI8wQM_FheExAvx0QJ-zMY1craZcEag1K1coINKPPkw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 8 + LVCA: 2 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAW-f1a6oJ3DMPUYY6SGvmU2bXZUxM9tH-ICrRLM0ejig= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAuu_CKIJDCq41BZOvU4z4gH484wfIlTBh3jGK8bJ6M-w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserARtOO83QKIFvzwyHltdOqn_fj7DOrstvd0ULWV0Af0fc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA-m9AcsWte2XsapAmRUjnuiJni1_0NG83An71gKU_3Qk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2304 + LdsInitCVgprs: false + LdsNumBytes: 2304 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [2, 0.08] + - - [129, 128, 1, 128, 129, 129, 129, 128] + - [0, 0.07] + - - [128, 129, 1, 128, 128, 128, 128, 129] + - [1, 0.07] + - - [128, 128, 1, 129, 128, 128, 128, 128] + - [3, 0.08] + - - [129, 129, 1, 128, 129, 129, 129, 129] + - [5, 0.07] + - - [129, 128, 1, 129, 129, 129, 129, 128] + - [6, 0.07] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [4, 0.07] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..3ccbd40fa0ac --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1911 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAssQcDAYsuGnrJ0RCDOfoZlcyUNjLyP5y1_KdtcmJf0E= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6528 + LdsInitCVgprs: false + LdsNumBytes: 6528 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlwVP7gyExHTNKoV_9-NqHQcY8x8JDQiBg2yf4PAHPvk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 1 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserABPLzPwkl1b26M47S18lTb0flFRVJhwDGzlzvnE61UQE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6528 + LdsInitCVgprs: false + LdsNumBytes: 6528 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAJwyqJfxSdPQhtNS5fOq2WVNAyT1YsraJu8houvostjU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAQNGo8mAAKe45Vr7xTXHP-SVujPWAE_eB4_O2SOqbvY0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 1 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA2fX9lfk9GAJdIriBwmfehppLN1D9ED3LkNtP7zSIUlc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [3, 0.08] + - - [129, 128, 1, 128, 129, 129, 129, 128] + - [5, 0.07] + - - [128, 129, 1, 128, 128, 128, 128, 128] + - [1, 0.08] + - - [128, 128, 1, 129, 128, 128, 128, 129] + - [0, 0.08] + - - [129, 129, 1, 128, 129, 129, 129, 128] + - [4, 0.07] + - - [129, 128, 1, 129, 129, 129, 129, 129] + - [5, 0.07] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [2, 0.08] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..1941ec3f8558 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAXq3Qh7OtsBZWdD5vOEmSIdhiSav2Ren-TgRqSFfMpZc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2304 + LdsInitCVgprs: false + LdsNumBytes: 2304 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAg3Er7LpiQJHElQtk-5iCTSz-wu8ScqhO65gaV9kLXIg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2560 + LdsInitCVgprs: false + LdsNumBytes: 2560 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA75TZ_95242teuBaf0bfjFy1tkVPH7kWdr2Kcg4JFgJs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2432 + LdsInitCVgprs: false + LdsNumBytes: 2432 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2432 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA0zZWwdGXwF0u_qsM45G41-beHeGkGA1Dzf7Lybt3f7I= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13056 + LdsInitCVgprs: false + LdsNumBytes: 13056 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlMsVS0gVUWkH3AExJlZmmYK6npX79DQmOZOqfQ-reTc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 4 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA7FBspZCHlbfbDKx8oD6wXa6U64Azerl5XZGHYTMC-Gs= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6528 + LdsInitCVgprs: false + LdsNumBytes: 6528 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAwJvHxKN5tg75Sp_5g7s9RMP3wv4WzIQNjssTW1YSxB0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [3, 0.08] + - - [129, 128, 1, 128, 129, 129, 128, 128] + - [2, 0.07] + - - [128, 129, 1, 128, 128, 128, 128, 129] + - [1, 0.08] + - - [128, 128, 1, 129, 128, 128, 129, 128] + - [0, 0.08] + - - [129, 129, 1, 128, 129, 129, 128, 129] + - [5, 0.07] + - - [129, 128, 1, 129, 129, 129, 129, 128] + - [4, 0.08] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [6, 0.07] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..bae1e6c44bf9 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1201 +- gfx1201 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAjS5FvVEezs2lcRsTtVmGR-aPCZ4PyEz4TS9Q5TV3AEU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAxdekMXyoBnaw8oe3vrLNklpTVCB_DbFyo5mWV4gGEVk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlkb4g3hYd2Ug1vQuFNo1sX2ASflzLV2vqjnC-evih3w= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA6qosFeA-HRbtOduZV-BrtS2p4okMa3p3LMFg2sCKRL8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAia-sIZol4cC_o1eSiq9SHyL3Zg22Xxh2FGk60q6KEXU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 5120 + LdsInitCVgprs: false + LdsNumBytes: 5120 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAes6gb3cYte_OgoONp1kpyfxv77n24deIis_dCJB50Ck= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2560 + LdsInitCVgprs: false + LdsNumBytes: 2560 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA7p8-kkOxKWtIwdWAfbGndlz2r4n7dzs4_xZweOgKk34= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 1] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [2, 0.08] + - - [129, 128, 1, 128, 129, 129, 128, 128] + - [0, 0.08] + - - [128, 129, 1, 128, 128, 128, 128, 128] + - [3, 0.08] + - - [128, 128, 1, 129, 128, 128, 129, 129] + - [4, 0.08] + - - [129, 129, 1, 128, 129, 129, 128, 128] + - [1, 0.07] + - - [129, 128, 1, 129, 129, 129, 129, 129] + - [5, 0.08] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [6, 0.07] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index cbf5e9b76067..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7014 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SVW8_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG64_2_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 512] - - [18, 1099.89] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 512] - - [6, 2144.12] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 512] - - [7, 2764.79] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 512] - - [19, 3519.82] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 512] - - [8, 4377.96] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 512] - - [6, 5057.82] - - - [512, 1024, 1, 64, 512, 512, 512, 1024] - - [7, 2087.37] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] - - [6, 3734.7] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 1024] - - [5, 4845.64] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 1024] - - [8, 6096.37] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 1024] - - [5, 7148.67] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 1024] - - [5, 7922.19] - - - [512, 1536, 1, 64, 512, 512, 512, 1536] - - [5, 2895.54] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 1536] - - [24, 4027.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 1536] - - [5, 6702.84] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 1536] - - [2, 7639.17] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 1536] - - [8, 9019.2] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 1536] - - [6, 9848.67] - - - [512, 2048, 1, 64, 512, 512, 512, 2048] - - [5, 3739.91] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 2048] - - [8, 6187.86] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 2048] - - [6, 7769.18] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 2048] - - [5, 9177.28] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 2048] - - [3, 10608.8] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 2048] - - [5, 12018.4] - - - [512, 2560, 1, 64, 512, 512, 512, 2560] - - [6, 4229.73] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 2560] - - [5, 7224.09] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 2560] - - [6, 8998.24] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 2560] - - [2, 10791.5] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 2560] - - [6, 12205.3] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 2560] - - [4, 12526.4] - - - [512, 3072, 1, 64, 512, 512, 512, 3072] - - [22, 3995.76] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 3072] - - [8, 7679.97] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 3072] - - [2, 10270.0] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 3072] - - [5, 11815.3] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 3072] - - [6, 12965.9] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 3072] - - [8, 13879.2] - - - [512, 512, 1, 256, 512, 512, 512, 512] - - [10, 3903.49] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 512] - - [7, 6716.76] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 512] - - [7, 8778.71] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 512] - - [5, 10953.9] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 512] - - [19, 12527.3] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 512] - - [1, 9610.21] - - - [512, 1024, 1, 256, 512, 512, 512, 1024] - - [7, 6923.26] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] - - [5, 11248.1] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 1024] - - [5, 14206.2] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 1024] - - [5, 16510.7] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 1024] - - [5, 18361.3] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 1024] - - [16, 19686.5] - - - [512, 1536, 1, 256, 512, 512, 512, 1536] - - [5, 8780.82] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 1536] - - [5, 14508.7] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 1536] - - [16, 16967.2] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 1536] - - [5, 19927.9] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 1536] - - [5, 21248.4] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 1536] - - [5, 22807.0] - - - [512, 2048, 1, 256, 512, 512, 512, 2048] - - [17, 7492.55] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 2048] - - [5, 16369.8] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 2048] - - [1, 19975.1] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 2048] - - [16, 21887.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 2048] - - [5, 23864.7] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 2048] - - [5, 25187.5] - - - [512, 2560, 1, 256, 512, 512, 512, 2560] - - [5, 12600.2] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 2560] - - [19, 18341.3] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 2560] - - [5, 21642.2] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 2560] - - [5, 23746.5] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 2560] - - [5, 25374.1] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 2560] - - [5, 26900.8] - - - [512, 3072, 1, 256, 512, 512, 512, 3072] - - [16, 14040.0] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 3072] - - [5, 20017.3] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 3072] - - [5, 22886.9] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 3072] - - [5, 25132.2] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 3072] - - [1, 26647.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 3072] - - [19, 27327.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [23, 6688.31] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [16, 11672.1] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [19, 14221.5] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [16, 17068.4] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [19, 18650.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [5, 20663.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [19, 7379.97] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] - - [5, 16821.9] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] - - [5, 20655.0] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 1024] - - [19, 22773.6] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 1024] - - [16, 24229.7] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] - - [19, 25706.9] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [5, 14320.4] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 1536] - - [5, 20400.7] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 1536] - - [5, 23523.4] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 1536] - - [16, 26308.4] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 1536] - - [19, 27021.9] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 1536] - - [5, 29369.5] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [5, 16680.5] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 2048] - - [19, 22463.2] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 2048] - - [16, 25764.7] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] - - [5, 27882.2] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 2048] - - [5, 29294.8] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 2048] - - [16, 30521.8] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [19, 18251.5] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 2560] - - [16, 19232.6] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 2560] - - [16, 27498.9] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 2560] - - [19, 29866.3] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 2560] - - [19, 30820.6] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 2560] - - [5, 32075.9] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [22, 15962.5] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] - - [16, 25657.7] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 3072] - - [16, 28584.3] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 3072] - - [5, 30594.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 3072] - - [1, 30950.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 3072] - - [16, 32420.7] - - - [512, 512, 1, 1024, 512, 512, 512, 512] - - [10, 10387.0] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [19, 17022.2] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 512] - - [5, 20059.7] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 512] - - [19, 22638.7] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 512] - - [20, 23365.2] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 512] - - [5, 26126.8] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [0, 13164.4] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 23349.1] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [16, 26046.3] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [16, 28018.9] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [5, 29343.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [19, 30870.5] - - - [512, 1536, 1, 1024, 512, 512, 512, 1536] - - [19, 19795.4] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [19, 25787.3] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1536] - - [16, 24877.5] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1536] - - [1, 30148.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1536] - - [19, 32041.5] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1536] - - [5, 32160.4] - - - [512, 2048, 1, 1024, 512, 512, 512, 2048] - - [19, 23000.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [19, 27958.6] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 2048] - - [20, 30353.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] - - [16, 32910.4] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 2048] - - [1, 33207.0] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 2048] - - [1, 34410.8] - - - [512, 2560, 1, 1024, 512, 512, 512, 2560] - - [19, 19293.7] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [20, 28586.9] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 2560] - - [5, 31516.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 2560] - - [5, 32695.5] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] - - [16, 33584.5] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 2560] - - [5, 34444.3] - - - [512, 3072, 1, 1024, 512, 512, 512, 3072] - - [5, 26250.5] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [16, 30288.9] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 3072] - - [19, 32653.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 3072] - - [19, 34049.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 3072] - - [16, 34721.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 3072] - - [5, 34981.8] - - - [512, 512, 1, 2048, 512, 512, 512, 512] - - [11, 15358.5] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 512] - - [1, 23129.8] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 512] - - [19, 24494.7] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [5, 27958.6] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 512] - - [16, 28495.7] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 512] - - [19, 29965.7] - - - [512, 1024, 1, 2048, 512, 512, 512, 1024] - - [19, 22924.3] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] - - [16, 27329.3] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] - - [19, 30559.4] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [19, 31352.9] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 1024] - - [16, 32903.2] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] - - [19, 34230.3] - - - [512, 1536, 1, 2048, 512, 512, 512, 1536] - - [19, 24394.2] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 1536] - - [5, 29957.4] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 1536] - - [5, 32132.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [5, 32967.6] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 1536] - - [19, 34108.8] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 1536] - - [5, 35303.1] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [19, 27543.7] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 31852.7] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [5, 33429.5] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [5, 35261.2] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35012.8] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [5, 34927.5] - - - [512, 2560, 1, 2048, 512, 512, 512, 2560] - - [19, 28473.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2560] - - [16, 32443.1] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2560] - - [19, 33703.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [19, 34952.8] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] - - [19, 35427.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2560] - - [19, 35568.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 3072] - - [1, 30643.9] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] - - [5, 33135.0] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 3072] - - [19, 35086.0] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [16, 35286.5] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 3072] - - [5, 35540.8] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 3072] - - [19, 35994.4] - - - [512, 512, 1, 3072, 512, 512, 512, 512] - - [10, 17907.8] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 512] - - [16, 25523.4] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 512] - - [19, 26395.7] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 512] - - [16, 29236.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 512] - - [5, 30260.9] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [19, 31219.3] - - - [512, 1024, 1, 3072, 512, 512, 512, 1024] - - [3, 24676.9] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] - - [19, 29555.2] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 1024] - - [5, 31553.2] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 1024] - - [5, 32571.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 1024] - - [16, 33635.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [1, 34003.7] - - - [512, 1536, 1, 3072, 512, 512, 512, 1536] - - [5, 26965.3] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 1536] - - [16, 31697.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 1536] - - [19, 33471.2] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 1536] - - [19, 34142.1] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 1536] - - [5, 34520.7] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [1, 35005.5] - - - [512, 2048, 1, 3072, 512, 512, 512, 2048] - - [1, 30031.5] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 2048] - - [19, 33287.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 2048] - - [16, 34538.9] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 2048] - - [5, 34890.2] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 2048] - - [19, 35540.9] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [19, 35493.7] - - - [512, 2560, 1, 3072, 512, 512, 512, 2560] - - [19, 30412.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 2560] - - [16, 34182.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 2560] - - [5, 34612.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 2560] - - [5, 34945.9] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 2560] - - [19, 35803.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [1, 35815.1] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [1, 32031.1] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 33794.0] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [1, 34947.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 35799.6] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [1, 35674.7] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 36047.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.37979e-05] - - - [1, 1, 1, 64, 1, 1, 1, 1] - - [15, 0.004] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [13, 0.00423883] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [13, 0.00425858] - - - [64, 64, 1, 1, 64, 64, 64, 64] - - [9, 0.269323] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [12, 0.267914] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [15, 0.266945] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [14, 17.0789] - - - [64, 64, 1, 256, 64, 64, 64, 64] - - [14, 62.8228] - - - [64, 64, 1, 512, 64, 64, 64, 64] - - [21, 117.235] - - - [64, 64, 1, 1024, 64, 64, 64, 64] - - [9, 196.083] - - - [64, 64, 1, 2048, 64, 64, 64, 64] - - [21, 307.602] - - - [64, 64, 1, 4096, 64, 64, 64, 64] - - [9, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index dfa46992dc06..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7273 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS0_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 64] - - [16, 1102.78] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] - - [6, 2110.34] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 64] - - [1, 2716.22] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 64] - - [2, 3824.3] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 64] - - [3, 4454.8] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 64] - - [17, 4254.67] - - - [512, 1024, 1, 64, 512, 512, 512, 64] - - [14, 2122.35] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 64] - - [20, 3584.78] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 64] - - [3, 5123.6] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 64] - - [3, 6276.11] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 64] - - [6, 6942.63] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 64] - - [7, 7808.35] - - - [512, 1536, 1, 64, 512, 512, 512, 64] - - [4, 2789.38] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 64] - - [3, 5102.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 64] - - [2, 6741.75] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 64] - - [3, 8102.81] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 64] - - [2, 9207.63] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 64] - - [3, 10299.8] - - - [512, 2048, 1, 64, 512, 512, 512, 64] - - [7, 3766.25] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 64] - - [2, 6254.18] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 64] - - [3, 7919.07] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 64] - - [2, 9321.0] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 64] - - [3, 10635.7] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 64] - - [3, 12069.0] - - - [512, 2560, 1, 64, 512, 512, 512, 64] - - [3, 4218.03] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 64] - - [2, 7070.04] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 64] - - [6, 9325.68] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 64] - - [3, 11009.6] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 64] - - [3, 12362.0] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 64] - - [7, 13337.8] - - - [512, 3072, 1, 64, 512, 512, 512, 64] - - [6, 5155.08] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 64] - - [7, 7920.79] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 64] - - [6, 10022.4] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 64] - - [3, 12052.6] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 64] - - [3, 13362.6] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 64] - - [2, 14403.3] - - - [512, 512, 1, 256, 512, 512, 512, 256] - - [0, 3747.32] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 256] - - [6, 7243.46] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 256] - - [8, 7307.28] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 256] - - [2, 11509.0] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 256] - - [2, 12441.2] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 256] - - [14, 14099.2] - - - [512, 1024, 1, 256, 512, 512, 512, 256] - - [14, 7025.08] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] - - [6, 11538.9] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 256] - - [6, 14617.0] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] - - [2, 16680.5] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 256] - - [6, 18354.1] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 256] - - [2, 20047.2] - - - [512, 1536, 1, 256, 512, 512, 512, 256] - - [2, 9348.59] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 256] - - [6, 14485.5] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 256] - - [6, 17521.9] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 256] - - [2, 20114.8] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 256] - - [6, 22041.2] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 256] - - [2, 23013.8] - - - [512, 2048, 1, 256, 512, 512, 512, 256] - - [2, 11085.3] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] - - [2, 16850.9] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 256] - - [2, 20295.0] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 256] - - [2, 22703.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 256] - - [2, 24267.1] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 256] - - [6, 26107.7] - - - [512, 2560, 1, 256, 512, 512, 512, 256] - - [2, 12965.9] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] - - [2, 18366.6] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 256] - - [2, 22004.8] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 256] - - [2, 24317.7] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 256] - - [2, 25695.5] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 256] - - [2, 26936.9] - - - [512, 3072, 1, 256, 512, 512, 512, 256] - - [14, 14423.2] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] - - [6, 20027.0] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 256] - - [2, 23488.8] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 256] - - [19, 25372.4] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 256] - - [2, 26893.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 256] - - [6, 28682.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [11, 6659.94] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [14, 11566.3] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [6, 14604.0] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [14, 17389.7] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [6, 19021.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [19, 20904.8] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [14, 11964.0] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] - - [6, 17177.7] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 512] - - [2, 20896.7] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] - - [2, 17762.9] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 512] - - [19, 25167.4] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 512] - - [6, 26659.8] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [2, 14577.3] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 512] - - [25, 15994.3] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 512] - - [14, 23856.2] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 512] - - [14, 26373.2] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 512] - - [6, 27608.5] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 512] - - [2, 29444.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [6, 17566.3] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 512] - - [2, 22682.0] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 512] - - [14, 26297.9] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 512] - - [2, 28275.7] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 512] - - [2, 29643.4] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 512] - - [2, 31025.5] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [14, 18754.9] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 512] - - [2, 24558.2] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 512] - - [6, 27629.3] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 512] - - [6, 29677.8] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 512] - - [6, 31297.4] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 512] - - [6, 32040.3] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [2, 20975.6] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 512] - - [2, 26490.8] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 512] - - [14, 29040.5] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 512] - - [6, 31019.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 512] - - [2, 31626.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 512] - - [2, 33263.9] - - - [512, 512, 1, 1024, 512, 512, 512, 1024] - - [11, 10976.3] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 17862.1] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] - - [24, 17439.6] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 23430.8] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 24764.8] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 26835.5] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [19, 18085.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 23671.6] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 26453.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 28648.2] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 29639.9] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [6, 30994.0] - - - [512, 1536, 1, 1024, 512, 512, 512, 1024] - - [19, 20362.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 26651.0] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 28667.6] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 31269.9] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1024] - - [19, 31786.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 32638.1] - - - [512, 2048, 1, 1024, 512, 512, 512, 1024] - - [19, 23477.0] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 28711.4] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 31338.4] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 32280.9] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 33493.4] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 33769.3] - - - [512, 2560, 1, 1024, 512, 512, 512, 1024] - - [6, 24833.8] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 29926.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 1024] - - [6, 31938.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 33211.2] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 1024] - - [2, 34754.8] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 1024] - - [14, 35387.6] - - - [512, 3072, 1, 1024, 512, 512, 512, 1024] - - [14, 26864.8] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 31058.4] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 32940.1] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 34567.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 34955.1] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 35700.4] - - - [512, 512, 1, 2048, 512, 512, 512, 2048] - - [12, 15600.1] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 23765.9] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 2048] - - [19, 20093.5] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 27942.4] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 28709.5] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 30761.4] - - - [512, 1024, 1, 2048, 512, 512, 512, 2048] - - [6, 23871.8] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 28220.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 30778.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 32535.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 33459.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34043.7] - - - [512, 1536, 1, 2048, 512, 512, 512, 2048] - - [14, 24983.7] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 30956.8] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 32333.8] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 33769.3] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 2048] - - [14, 34579.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34607.7] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [15, 27721.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 32373.6] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34114.2] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 34644.4] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [2, 35409.6] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 36021.8] - - - [512, 2560, 1, 2048, 512, 512, 512, 2048] - - [2, 28845.4] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 33035.2] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34694.1] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35699.2] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35409.2] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 35717.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 2048] - - [2, 30754.1] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 34233.1] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 35127.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [19, 35648.4] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 35884.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 37382.1] - - - [512, 512, 1, 3072, 512, 512, 512, 3072] - - [12, 18127.9] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 26442.7] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 26322.2] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 30968.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 30018.1] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 32424.1] - - - [512, 1024, 1, 3072, 512, 512, 512, 3072] - - [2, 27073.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 30600.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 32482.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 33371.5] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 34360.5] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 34915.3] - - - [512, 1536, 1, 3072, 512, 512, 512, 3072] - - [2, 27553.8] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 32703.2] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 33662.8] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 34536.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 35147.2] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 35200.9] - - - [512, 2048, 1, 3072, 512, 512, 512, 3072] - - [2, 31780.0] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 33546.5] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 34553.2] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 3072] - - [14, 35790.3] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 3072] - - [2, 36033.7] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 36455.0] - - - [512, 2560, 1, 3072, 512, 512, 512, 3072] - - [6, 31027.4] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 3072] - - [6, 35006.1] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 35047.6] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 36074.1] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 36488.0] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [14, 36276.3] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [2, 33344.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 35036.4] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 36277.7] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 35989.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 36792.8] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 37505.2] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.53659e-05] - - - [1, 1, 1, 64, 1, 1, 1, 64] - - [23, 0.00415476] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [5, 0.00401506] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [12, 0.00426141] - - - [64, 64, 1, 1, 64, 64, 64, 1] - - [18, 0.268344] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [22, 0.267642] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [22, 0.266519] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [9, 16.5652] - - - [64, 64, 1, 256, 64, 64, 64, 256] - - [10, 65.3522] - - - [64, 64, 1, 512, 64, 64, 64, 512] - - [22, 116.908] - - - [64, 64, 1, 1024, 64, 64, 64, 1024] - - [22, 202.047] - - - [64, 64, 1, 2048, 64, 64, 64, 2048] - - [22, 318.541] - - - [64, 64, 1, 4096, 64, 64, 64, 4096] - - [21, 435.687] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index b1fd92133b3b..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,8050 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS0_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 512] - - [4, 1083.17] - - - [1024, 512, 1, 64, 1024, 1024, 64, 512] - - [6, 2149.62] - - - [1536, 512, 1, 64, 1536, 1536, 64, 512] - - [3, 2862.44] - - - [2048, 512, 1, 64, 2048, 2048, 64, 512] - - [1, 3757.71] - - - [2560, 512, 1, 64, 2560, 2560, 64, 512] - - [9, 4330.39] - - - [3072, 512, 1, 64, 3072, 3072, 64, 512] - - [1, 5161.69] - - - [512, 1024, 1, 64, 512, 512, 64, 1024] - - [19, 2075.04] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 1024] - - [20, 3608.97] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 1024] - - [23, 4639.72] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 1024] - - [1, 6143.81] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 1024] - - [1, 7283.99] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 1024] - - [6, 7999.47] - - - [512, 1536, 1, 64, 512, 512, 64, 1536] - - [6, 2970.91] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 1536] - - [8, 5152.58] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 1536] - - [6, 6800.96] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 1536] - - [8, 8088.0] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 1536] - - [2, 9133.94] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 1536] - - [6, 10264.6] - - - [512, 2048, 1, 64, 512, 512, 64, 2048] - - [19, 3696.54] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 2048] - - [1, 6298.34] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 2048] - - [19, 7790.22] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 2048] - - [7, 9700.44] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 2048] - - [9, 10984.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 2048] - - [9, 12022.0] - - - [512, 2560, 1, 64, 512, 512, 64, 2560] - - [20, 4391.71] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 2560] - - [6, 7101.62] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 2560] - - [1, 9360.37] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 2560] - - [1, 11006.0] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 2560] - - [8, 12365.6] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 2560] - - [7, 13479.1] - - - [512, 3072, 1, 64, 512, 512, 64, 3072] - - [0, 4434.99] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 3072] - - [6, 8175.2] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 3072] - - [6, 10493.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 3072] - - [7, 11785.7] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 3072] - - [7, 13452.3] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 3072] - - [6, 14621.4] - - - [512, 512, 1, 256, 512, 512, 256, 512] - - [13, 3950.6] - - - [1024, 512, 1, 256, 1024, 1024, 256, 512] - - [21, 7257.17] - - - [1536, 512, 1, 256, 1536, 1536, 256, 512] - - [5, 7936.24] - - - [2048, 512, 1, 256, 2048, 2048, 256, 512] - - [1, 11596.0] - - - [2560, 512, 1, 256, 2560, 2560, 256, 512] - - [21, 12681.2] - - - [3072, 512, 1, 256, 3072, 3072, 256, 512] - - [22, 13454.7] - - - [512, 1024, 1, 256, 512, 512, 256, 1024] - - [27, 5362.92] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] - - [21, 11518.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 1024] - - [6, 14363.5] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 1024] - - [6, 16690.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 1024] - - [1, 18860.1] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 1024] - - [6, 19744.2] - - - [512, 1536, 1, 256, 512, 512, 256, 1536] - - [19, 9025.27] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 1536] - - [1, 14474.8] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 1536] - - [6, 17097.1] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 1536] - - [6, 20107.5] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 1536] - - [6, 21992.8] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 1536] - - [6, 23393.3] - - - [512, 2048, 1, 256, 512, 512, 256, 2048] - - [19, 11514.2] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 2048] - - [6, 16835.1] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 2048] - - [6, 20200.8] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 2048] - - [6, 21977.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 2048] - - [6, 24251.5] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 2048] - - [6, 25706.9] - - - [512, 2560, 1, 256, 512, 512, 256, 2560] - - [6, 8237.86] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 2560] - - [6, 18794.3] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 2560] - - [6, 21764.1] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 2560] - - [6, 23881.9] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 2560] - - [21, 25509.3] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 2560] - - [6, 27707.3] - - - [512, 3072, 1, 256, 512, 512, 256, 3072] - - [8, 14498.3] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 3072] - - [6, 19817.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 3072] - - [6, 22961.1] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 3072] - - [6, 25392.4] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 3072] - - [8, 27016.5] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 3072] - - [8, 28086.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [26, 6889.5] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [1, 12085.4] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [14, 9874.52] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [6, 17180.4] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [6, 18934.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [1, 21194.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [6, 12156.6] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 1024] - - [21, 17796.6] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 1024] - - [1, 21025.2] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 1024] - - [6, 23354.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 1024] - - [21, 24928.3] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 1024] - - [1, 26813.2] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [21, 14516.6] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 1536] - - [6, 21185.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 1536] - - [6, 24108.8] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 1536] - - [6, 26635.3] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 1536] - - [21, 27782.2] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 1536] - - [6, 29862.3] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [19, 12205.1] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 2048] - - [6, 23129.8] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 2048] - - [6, 26490.8] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 2048] - - [6, 28155.0] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 2048] - - [1, 30072.6] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 2048] - - [1, 30680.4] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [6, 18757.3] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 2560] - - [6, 19614.2] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 2560] - - [6, 27576.2] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 2560] - - [1, 30077.5] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 2560] - - [6, 31499.1] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 2560] - - [6, 32458.2] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [21, 21039.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 3072] - - [1, 26199.0] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 3072] - - [6, 29460.8] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 3072] - - [1, 30827.6] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 3072] - - [1, 31993.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 3072] - - [6, 33276.4] - - - [512, 512, 1, 1024, 512, 512, 1024, 512] - - [12, 10971.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [21, 18310.7] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 512] - - [1, 19266.4] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 512] - - [19, 18428.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 512] - - [1, 24497.7] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 512] - - [1, 26869.1] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [21, 18273.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [6, 24337.7] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [19, 26873.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [19, 29022.4] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [19, 30283.1] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [1, 31153.2] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1536] - - [6, 20246.8] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [1, 27449.5] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1536] - - [21, 28920.3] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1536] - - [6, 31424.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1536] - - [1, 31814.4] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1536] - - [1, 33218.0] - - - [512, 2048, 1, 1024, 512, 512, 1024, 2048] - - [21, 24365.6] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [21, 29038.1] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 2048] - - [1, 31145.5] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 2048] - - [19, 32352.7] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 2048] - - [19, 33927.1] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 2048] - - [1, 33613.0] - - - [512, 2560, 1, 1024, 512, 512, 1024, 2560] - - [6, 24540.4] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [22, 29472.2] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 2560] - - [19, 32023.7] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 2560] - - [6, 33480.9] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 2560] - - [1, 33714.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 2560] - - [21, 34741.4] - - - [512, 3072, 1, 1024, 512, 512, 1024, 3072] - - [1, 27182.6] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [6, 31539.8] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 3072] - - [21, 33705.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 3072] - - [19, 34361.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 3072] - - [19, 35487.5] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 3072] - - [8, 35326.3] - - - [512, 512, 1, 2048, 512, 512, 2048, 512] - - [28, 15679.9] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 512] - - [21, 24296.6] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 512] - - [21, 24278.2] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [6, 29739.2] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 512] - - [6, 28423.0] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 512] - - [21, 31795.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 1024] - - [21, 24437.9] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 1024] - - [6, 29380.4] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 1024] - - [21, 31789.3] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [20, 31191.0] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 1024] - - [21, 33912.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 1024] - - [6, 35255.0] - - - [512, 1536, 1, 2048, 512, 512, 2048, 1536] - - [6, 24873.6] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 1536] - - [6, 31882.3] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 1536] - - [21, 31570.7] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [21, 34462.4] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 1536] - - [21, 34598.5] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 1536] - - [6, 35573.9] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [6, 29667.1] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [6, 33209.9] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [6, 34438.3] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35818.1] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [6, 36040.1] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [6, 36782.1] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2560] - - [22, 27572.0] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2560] - - [1, 33165.8] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2560] - - [6, 34165.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [6, 35669.4] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2560] - - [21, 35789.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2560] - - [6, 36272.2] - - - [512, 3072, 1, 2048, 512, 512, 2048, 3072] - - [19, 31784.6] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 3072] - - [6, 35267.6] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 3072] - - [6, 35707.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [21, 35920.7] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 3072] - - [21, 36186.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 3072] - - [6, 36613.0] - - - [512, 512, 1, 3072, 512, 512, 3072, 512] - - [28, 18056.6] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 512] - - [6, 27553.0] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 512] - - [19, 26363.9] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 512] - - [6, 32272.8] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 512] - - [19, 30756.5] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [21, 33875.9] - - - [512, 1024, 1, 3072, 512, 512, 3072, 1024] - - [21, 27610.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 1024] - - [21, 31679.6] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 1024] - - [1, 33794.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 1024] - - [6, 34582.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 1024] - - [6, 34984.0] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [6, 36031.0] - - - [512, 1536, 1, 3072, 512, 512, 3072, 1536] - - [6, 23558.9] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 1536] - - [1, 33291.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 1536] - - [8, 33039.3] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 1536] - - [21, 35348.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 1536] - - [1, 35120.0] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [21, 35699.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 2048] - - [1, 32285.7] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 2048] - - [6, 34666.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 2048] - - [6, 35985.3] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 2048] - - [21, 35977.0] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 2048] - - [6, 37045.4] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [1, 36987.5] - - - [512, 2560, 1, 3072, 512, 512, 3072, 2560] - - [19, 29945.3] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 2560] - - [19, 35250.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 2560] - - [6, 35889.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 2560] - - [21, 36970.0] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 2560] - - [8, 36851.2] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [21, 36784.3] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [6, 33508.8] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [19, 36071.5] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [6, 36754.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [6, 37061.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [6, 36768.6] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 37017.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [15, 6.66533e-05] - - - [1, 1, 1, 64, 1, 1, 64, 1] - - [15, 0.00412252] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [16, 0.00426852] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [25, 0.00407916] - - - [64, 64, 1, 1, 64, 64, 1, 64] - - [17, 0.266164] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [24, 0.271564] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [18, 0.270658] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [11, 17.2707] - - - [64, 64, 1, 256, 64, 64, 256, 64] - - [11, 63.0286] - - - [64, 64, 1, 512, 64, 64, 512, 64] - - [10, 115.644] - - - [64, 64, 1, 1024, 64, 64, 1024, 64] - - [24, 203.124] - - - [64, 64, 1, 2048, 64, 64, 2048, 64] - - [10, 317.817] - - - [64, 64, 1, 4096, 64, 64, 4096, 64] - - [10, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 36ee2e984fca..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/GridBased/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,9086 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1201 -- gfx1201 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 64] - - [26, 1116.36] - - - [1024, 512, 1, 64, 1024, 1024, 64, 64] - - [24, 2109.68] - - - [1536, 512, 1, 64, 1536, 1536, 64, 64] - - [6, 2920.74] - - - [2048, 512, 1, 64, 2048, 2048, 64, 64] - - [24, 3812.35] - - - [2560, 512, 1, 64, 2560, 2560, 64, 64] - - [9, 4280.56] - - - [3072, 512, 1, 64, 3072, 3072, 64, 64] - - [7, 4977.66] - - - [512, 1024, 1, 64, 512, 512, 64, 64] - - [20, 2015.7] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] - - [5, 3695.63] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 64] - - [6, 5176.29] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] - - [26, 5979.18] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 64] - - [7, 7139.54] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 64] - - [7, 8016.99] - - - [512, 1536, 1, 64, 512, 512, 64, 64] - - [4, 2992.1] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 64] - - [8, 4945.87] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 64] - - [23, 6493.29] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 64] - - [3, 8039.4] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 64] - - [3, 9563.3] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 64] - - [7, 10179.8] - - - [512, 2048, 1, 64, 512, 512, 64, 64] - - [9, 3660.25] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 64] - - [3, 6320.59] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 64] - - [3, 8217.08] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 64] - - [3, 9524.73] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 64] - - [3, 11184.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 64] - - [3, 11892.1] - - - [512, 2560, 1, 64, 512, 512, 64, 64] - - [4, 4524.6] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 64] - - [3, 7363.92] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 64] - - [7, 9451.78] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 64] - - [3, 11067.7] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 64] - - [4, 12345.4] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 64] - - [3, 13638.3] - - - [512, 3072, 1, 64, 512, 512, 64, 64] - - [5, 5016.11] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 64] - - [6, 7863.24] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 64] - - [8, 10419.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 64] - - [7, 12251.0] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 64] - - [8, 13673.5] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 64] - - [3, 14721.3] - - - [512, 512, 1, 256, 512, 512, 256, 256] - - [23, 3824.3] - - - [1024, 512, 1, 256, 1024, 1024, 256, 256] - - [21, 6310.04] - - - [1536, 512, 1, 256, 1536, 1536, 256, 256] - - [3, 9379.08] - - - [2048, 512, 1, 256, 2048, 2048, 256, 256] - - [7, 11175.5] - - - [2560, 512, 1, 256, 2560, 2560, 256, 256] - - [3, 12621.6] - - - [3072, 512, 1, 256, 3072, 3072, 256, 256] - - [7, 14480.3] - - - [512, 1024, 1, 256, 512, 512, 256, 256] - - [7, 7320.5] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] - - [7, 11689.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 256] - - [32, 11671.1] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] - - [7, 16930.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 256] - - [3, 18384.2] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 256] - - [3, 20444.7] - - - [512, 1536, 1, 256, 512, 512, 256, 256] - - [3, 9420.77] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 256] - - [7, 14561.4] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 256] - - [7, 17782.7] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 256] - - [7, 20398.1] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 256] - - [7, 22130.6] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 256] - - [7, 23282.6] - - - [512, 2048, 1, 256, 512, 512, 256, 256] - - [23, 11511.4] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 256] - - [7, 17019.7] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 256] - - [7, 20531.0] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 256] - - [7, 22318.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 256] - - [3, 24464.2] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 256] - - [7, 26127.0] - - - [512, 2560, 1, 256, 512, 512, 256, 256] - - [23, 12787.8] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 256] - - [23, 17516.4] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 256] - - [7, 22099.0] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 256] - - [3, 24493.2] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 256] - - [23, 26039.0] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 256] - - [7, 27133.3] - - - [512, 3072, 1, 256, 512, 512, 256, 256] - - [31, 11673.0] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 256] - - [7, 20336.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 256] - - [7, 23677.8] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 256] - - [23, 25872.3] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 256] - - [3, 27085.7] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 256] - - [3, 28148.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [29, 6901.9] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [22, 10023.5] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [3, 14241.6] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [7, 17210.7] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [7, 18333.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [18, 20924.1] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [18, 11163.9] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] - - [3, 17257.7] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 512] - - [3, 21191.4] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] - - [3, 23518.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 512] - - [7, 24712.4] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 512] - - [3, 26455.7] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [7, 14173.7] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 512] - - [3, 21272.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 512] - - [7, 23413.7] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] - - [7, 26629.0] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 512] - - [3, 27606.6] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 512] - - [3, 29315.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [3, 17921.7] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 512] - - [7, 23634.6] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 512] - - [3, 26315.1] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 512] - - [3, 28469.1] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 512] - - [18, 30072.4] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 512] - - [23, 31016.6] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [3, 18699.8] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 512] - - [3, 24838.3] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 512] - - [3, 27568.4] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 512] - - [3, 30484.8] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 512] - - [7, 31541.9] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 512] - - [23, 31976.4] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [25, 20206.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 512] - - [3, 26139.5] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 512] - - [7, 29248.4] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 512] - - [3, 30635.2] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 512] - - [7, 31928.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 512] - - [3, 32401.2] - - - [512, 512, 1, 1024, 512, 512, 1024, 1024] - - [16, 11191.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [30, 15094.6] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 1024] - - [7, 20115.1] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] - - [1, 18401.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 24437.0] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 27036.4] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [23, 18759.2] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 23930.3] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 27311.9] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 29234.0] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 30887.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 31271.5] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1024] - - [3, 15196.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 27468.2] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1024] - - [9, 28538.7] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 31260.8] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 31617.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 33265.9] - - - [512, 2048, 1, 1024, 512, 512, 1024, 1024] - - [7, 24437.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [23, 29042.0] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 31309.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 32573.8] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 1024] - - [7, 33352.6] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 33978.9] - - - [512, 2560, 1, 1024, 512, 512, 1024, 1024] - - [3, 24450.6] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 30538.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 31676.3] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 1024] - - [18, 33474.4] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 1024] - - [25, 34230.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 34591.6] - - - [512, 3072, 1, 1024, 512, 512, 1024, 1024] - - [7, 27123.1] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [25, 30137.2] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 33263.7] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 33801.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 35197.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 35739.8] - - - [512, 512, 1, 2048, 512, 512, 2048, 2048] - - [0, 16070.1] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 24839.9] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 24644.6] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 29724.8] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 2048] - - [7, 28576.3] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 31765.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] - - [30, 19501.7] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 29849.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 31696.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [7, 33441.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 2048] - - [18, 33752.7] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 34739.8] - - - [512, 1536, 1, 2048, 512, 512, 2048, 2048] - - [7, 25223.0] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 32174.0] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 31646.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 34694.8] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34553.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35759.2] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [3, 29652.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33297.5] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 34608.8] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 35139.7] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 35341.3] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35956.2] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2048] - - [3, 28258.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33681.7] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2048] - - [3, 34214.0] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 35566.9] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34975.4] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36457.3] - - - [512, 3072, 1, 2048, 512, 512, 2048, 2048] - - [23, 32028.3] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 35102.8] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 35622.4] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 36506.3] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 36578.4] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36227.1] - - - [512, 512, 1, 3072, 512, 512, 3072, 3072] - - [16, 19097.1] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] - - [7, 22485.2] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 27063.8] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 31945.5] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 3072] - - [25, 30201.7] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 33670.1] - - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] - - [23, 27764.6] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 31934.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 33879.6] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 34652.8] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 3072] - - [7, 35678.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36476.6] - - - [512, 1536, 1, 3072, 512, 512, 3072, 3072] - - [19, 26411.6] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 33941.6] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 33518.4] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 35550.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 3072] - - [19, 35037.3] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36761.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 3072] - - [23, 32025.2] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 34942.8] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 3072] - - [7, 36511.0] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 36512.1] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36582.2] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 36507.9] - - - [512, 2560, 1, 3072, 512, 512, 3072, 3072] - - [9, 30497.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 35525.0] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 35295.7] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 36415.3] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36079.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36791.5] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [23, 33847.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [3, 35738.6] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [23, 36281.0] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [3, 36700.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36778.3] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 37457.0] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [17, 6.65624e-05] - - - [1, 1, 1, 64, 1, 1, 64, 64] - - [21, 0.00424868] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [13, 0.00424305] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [15, 0.00427565] - - - [64, 64, 1, 1, 64, 64, 1, 1] - - [2, 0.269944] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [10, 0.250505] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [28, 0.271195] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [12, 17.4611] - - - [64, 64, 1, 256, 64, 64, 256, 256] - - [11, 64.9052] - - - [64, 64, 1, 512, 64, 64, 512, 512] - - [14, 118.493] - - - [64, 64, 1, 1024, 64, 64, 1024, 1024] - - [11, 203.869] - - - [64, 64, 1, 2048, 64, 64, 2048, 2048] - - [11, 322.713] - - - [64, 64, 1, 4096, 64, 64, 4096, 4096] - - [27, 447.864] -- null -- null -- DeviceEfficiency -- GridBased From 18abe02e351f35e557bd0c6dd304511d8e51ed53 Mon Sep 17 00:00:00 2001 From: wencchen Date: Tue, 24 Jun 2025 02:34:40 +0000 Subject: [PATCH 2/5] rm outdated HHS auxh kernels in gridbased gfx1200 --- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2213 ++++ ...jlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 7014 ------------- ...ljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 7273 ------------- ...jlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 8050 --------------- ...ljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml | 9086 ----------------- 8 files changed, 8852 insertions(+), 31423 deletions(-) create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml create mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml delete mode 100644 projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..5afc7bd9737c --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA5V5zNUlGs0XPWItJNtxIjbzEDM2o5rgUbaeh6pvDflg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAFIkupvkyXPNOy7I-3FF-NlWiRrSRhdTRsYznF_8mqT4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAtkbH-xuD8zpHvBJuf97ihx0uFoWP2EYo0izJhNX4WJc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 32 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAj86OHcyuh-nqlgm6yAY_J2GzH4bz2W9rUztoV9_MkAE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2304 + LdsInitCVgprs: false + LdsNumBytes: 2304 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA0hdwoQwbVKOxsM8a30UVKSctJj0DHVJikyfbBcQ3bpU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 32 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAv6Qm8D3QgMzNokWPYkh__asDLpAF8tQne5Mfh2u3Mu0= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 4608 + LdsInitCVgprs: false + LdsNumBytes: 4608 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAZZe5UWKNNiTUva0Vn_gwbYC-HYHLPFuT_vWg-6rEdGY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 2 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 4608 + LdsInitCVgprs: false + LdsNumBytes: 4608 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 4608 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [2, 0.05] + - - [129, 128, 1, 128, 129, 129, 129, 128] + - [1, 0.05] + - - [128, 129, 1, 128, 128, 128, 128, 129] + - [0, 0.05] + - - [128, 128, 1, 129, 128, 128, 128, 128] + - [5, 0.05] + - - [129, 129, 1, 128, 129, 129, 129, 129] + - [4, 0.06] + - - [129, 128, 1, 129, 129, 129, 129, 128] + - [3, 0.05] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [6, 0.05] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..36b42ed1661e --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA3UxsVRViHwneNhGe8_9RC-Vi_-62zTPt7hDhN5sx8Dk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserALBrxjIVbe9xsypJ5iqusuD5y0ud2PuBb2rSjn3Fw-SM= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAj_fcRH5vrhIwQdBt_3sdYhX6ET7X1VPQJPgk24nhRqI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2560 + LdsInitCVgprs: false + LdsNumBytes: 2560 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA2J76fzRZkTaEu-z1WIzqqf2DeSnWu1jdlOp-H7MRWoc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 64 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13056 + LdsInitCVgprs: false + LdsNumBytes: 13056 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAulbyz5s8NOqfSnEb6wkSAxlObkqj6m-qfyp7kM_diV4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAFtT7EEQ04cTZjRVT3zVfMwAR8dcpsl_M5L8XKOevfhk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 32 + LSPA: 2 + LSPB: 1 + LVCA: 16 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 16 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAi3k3mvRBMsWBJceNQHEK5UQzpno8XAZ9YzFqSDZB8mE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 16 + LSCB: 64 + LSPA: 2 + LSPB: 4 + LVCA: 16 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13056 + LdsInitCVgprs: false + LdsNumBytes: 13056 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 32 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [1, 0.05] + - - [129, 128, 1, 128, 129, 129, 129, 128] + - [0, 0.05] + - - [128, 129, 1, 128, 128, 128, 128, 128] + - [2, 0.05] + - - [128, 128, 1, 129, 128, 128, 128, 129] + - [3, 0.05] + - - [129, 129, 1, 128, 129, 129, 129, 128] + - [6, 0.05] + - - [129, 128, 1, 129, 129, 129, 129, 129] + - [5, 0.05] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [4, 0.05] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..9484935138d3 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAYEduLLrkRwW7nvY8FjyDOKR0X2mjHoXXQBVGyvRU7tY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoFez-_kQ-PuatlsN5JlmSHh4T79Tn3BVV663YZB3q9U= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 4 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA9V1wf0B_vGaplRTk3izVhllX70Yr5PZZ3rsFX1Y-HuU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 4 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA78hixZ9neUtij29lh2vSJarVEjvw_7ZFo2z_RBvK2A4= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2432 + LdsInitCVgprs: false + LdsNumBytes: 2432 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2432 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAEQCWJH6i7g1_rcAtIc39DbIkNegtcFcfxStZnu4ZfSY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 12800 + LdsInitCVgprs: false + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2304 + LdsOffsetMetadata_Blk: 10496 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAQSVUBgP6vRZTU6NN5trNFYaDfuAm3uE7tIdCRN4zTls= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13056 + LdsInitCVgprs: false + LdsNumBytes: 13056 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserADhaRDIyWP7M-cYFjykXduhHTiSyIkrHua64yTGTLGc8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 16 + LSPA: 2 + LSPB: 2 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 2 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13056 + LdsInitCVgprs: false + LdsNumBytes: 13056 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2304 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 8 + NumLoadsB: 32 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [3, 0.05] + - - [129, 128, 1, 128, 129, 129, 128, 128] + - [6, 0.05] + - - [128, 129, 1, 128, 128, 128, 128, 129] + - [0, 0.05] + - - [128, 128, 1, 129, 128, 128, 129, 128] + - [1, 0.05] + - - [129, 129, 1, 128, 129, 129, 128, 129] + - [4, 0.05] + - - [129, 128, 1, 129, 129, 129, 129, 128] + - [2, 0.05] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [5, 0.05] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..25886fcde524 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,2213 @@ +- {MinimumRequiredVersion: 4.33.0} +- gfx1200 +- gfx1200 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAEFxT2akH41gl7rr3zSi_67_N9aj_4Ho8Aswbd7LIBYE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 8 + LVCA: 8 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAbUKAPKpEERts3iuNwJCTMwbDh9PLx70dRrVOarocaG8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA_cU9g7p-BLxgjjO2NEiLz4eWQWXogS-KJyfVigajQCc= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6400 + LdsInitCVgprs: false + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA-tQUyMYVl9AYSwQzDimPbuzM4MsX5ocNjbdvR3Q9kIo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 2560 + LdsInitCVgprs: false + LdsNumBytes: 2560 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 2 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA8lrRX_Ke3UuEGIY_CZIvxXTEwqKIJFVOV_aVcyGIC48= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 6656 + LdsInitCVgprs: false + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 2 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAix5OfNB4tktC22obSHl7a_Ne-mnN05cUd1_kdWLRBAA= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 2 + LVCA: 8 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 13312 + LdsInitCVgprs: false + LdsNumBytes: 13312 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 2560 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 1 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAZlEgPLQuvcw9KFARFIEUcCi_mwVtU1IwhuZ3KNTPSLk= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: '4' + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 64 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: 0 + DirectToVgprB: 0 + DirectToVgprSparseMetadata: false + EdgeType: ShiftPtr + EnableF32XEmulationLds: false + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 2 + ForceDisableShadowInit: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseUniversalArgs: true} + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + LDSTrInst: false + LSCA: 64 + LSCB: 64 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 5120 + LdsInitCVgprs: false + LdsNumBytes: 5120 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2560 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMetadata: 5120 + LdsOffsetMetadata_Blk: 10752 + LdsPadA: 16 + LdsPadB: 16 + LdsPadMetadata: 0 + LocalReadVectorWidth: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 64 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 8 + MIInputPerThreadA: 8 + MIInputPerThreadB: 8 + MIInputPerThreadMetadata: 8 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [1, 1] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 16 + MacroTile1: 16 + MacroTileA: 16 + MacroTileB: 16 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxLDS: 65536 + MaxOccupancy: 40 + MbskPrefetchOpt: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: true + NonDTLTailLoopB: true + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 32 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + PreloadKernArgs: 0 + ProblemType: + Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: true + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 1 + StreamK: 0 + StreamKAtomic: 0 + StreamKXCCMapping: 0 + SubGroup0: 2 + SubGroup1: 16 + SubGroupA: 2 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: true + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMajorLDSMetadata: true + Use64bShadowLimit: 1 + UseDotInstruction: false + UseF32XEmulation: false + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [16, 2, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 1 + enableLDSTrA: false + enableLDSTrB: false + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [128, 128, 1, 128, 128, 128, 128, 128] + - [3, 0.06] + - - [129, 128, 1, 128, 129, 129, 128, 128] + - [0, 0.05] + - - [128, 129, 1, 128, 128, 128, 128, 128] + - [5, 0.05] + - - [128, 128, 1, 129, 128, 128, 129, 129] + - [2, 0.05] + - - [129, 129, 1, 128, 129, 129, 128, 128] + - [1, 0.05] + - - [129, 128, 1, 129, 129, 129, 129, 129] + - [6, 0.05] + - - [129, 129, 1, 129, 129, 129, 129, 129] + - [4, 0.05] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 81e14ddb3e02..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7014 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SVW8_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG64_2_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SVW8_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG64_2_1 - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 64 - LVCB: 16 - LVPA: 2 - LVPB: 8 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13568 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB256_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12544 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA1024_LBSPPB512_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29184 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: false - TransposeB: true - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 512] - - [18, 1099.89] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 512] - - [6, 2144.12] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 512] - - [7, 2764.79] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 512] - - [19, 3519.82] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 512] - - [8, 4377.96] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 512] - - [6, 5057.82] - - - [512, 1024, 1, 64, 512, 512, 512, 1024] - - [7, 2087.37] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 1024] - - [6, 3734.7] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 1024] - - [5, 4845.64] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 1024] - - [8, 6096.37] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 1024] - - [5, 7148.67] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 1024] - - [5, 7922.19] - - - [512, 1536, 1, 64, 512, 512, 512, 1536] - - [5, 2895.54] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 1536] - - [24, 4027.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 1536] - - [5, 6702.84] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 1536] - - [2, 7639.17] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 1536] - - [8, 9019.2] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 1536] - - [6, 9848.67] - - - [512, 2048, 1, 64, 512, 512, 512, 2048] - - [5, 3739.91] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 2048] - - [8, 6187.86] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 2048] - - [6, 7769.18] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 2048] - - [5, 9177.28] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 2048] - - [3, 10608.8] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 2048] - - [5, 12018.4] - - - [512, 2560, 1, 64, 512, 512, 512, 2560] - - [6, 4229.73] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 2560] - - [5, 7224.09] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 2560] - - [6, 8998.24] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 2560] - - [2, 10791.5] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 2560] - - [6, 12205.3] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 2560] - - [4, 12526.4] - - - [512, 3072, 1, 64, 512, 512, 512, 3072] - - [22, 3995.76] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 3072] - - [8, 7679.97] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 3072] - - [2, 10270.0] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 3072] - - [5, 11815.3] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 3072] - - [6, 12965.9] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 3072] - - [8, 13879.2] - - - [512, 512, 1, 256, 512, 512, 512, 512] - - [10, 3903.49] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 512] - - [7, 6716.76] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 512] - - [7, 8778.71] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 512] - - [5, 10953.9] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 512] - - [19, 12527.3] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 512] - - [1, 9610.21] - - - [512, 1024, 1, 256, 512, 512, 512, 1024] - - [7, 6923.26] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 1024] - - [5, 11248.1] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 1024] - - [5, 14206.2] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 1024] - - [5, 16510.7] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 1024] - - [5, 18361.3] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 1024] - - [16, 19686.5] - - - [512, 1536, 1, 256, 512, 512, 512, 1536] - - [5, 8780.82] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 1536] - - [5, 14508.7] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 1536] - - [16, 16967.2] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 1536] - - [5, 19927.9] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 1536] - - [5, 21248.4] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 1536] - - [5, 22807.0] - - - [512, 2048, 1, 256, 512, 512, 512, 2048] - - [17, 7492.55] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 2048] - - [5, 16369.8] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 2048] - - [1, 19975.1] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 2048] - - [16, 21887.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 2048] - - [5, 23864.7] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 2048] - - [5, 25187.5] - - - [512, 2560, 1, 256, 512, 512, 512, 2560] - - [5, 12600.2] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 2560] - - [19, 18341.3] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 2560] - - [5, 21642.2] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 2560] - - [5, 23746.5] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 2560] - - [5, 25374.1] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 2560] - - [5, 26900.8] - - - [512, 3072, 1, 256, 512, 512, 512, 3072] - - [16, 14040.0] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 3072] - - [5, 20017.3] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 3072] - - [5, 22886.9] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 3072] - - [5, 25132.2] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 3072] - - [1, 26647.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 3072] - - [19, 27327.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [23, 6688.31] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [16, 11672.1] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [19, 14221.5] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [16, 17068.4] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [19, 18650.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [5, 20663.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [19, 7379.97] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 1024] - - [5, 16821.9] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 1024] - - [5, 20655.0] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 1024] - - [19, 22773.6] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 1024] - - [16, 24229.7] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 1024] - - [19, 25706.9] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [5, 14320.4] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 1536] - - [5, 20400.7] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 1536] - - [5, 23523.4] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 1536] - - [16, 26308.4] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 1536] - - [19, 27021.9] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 1536] - - [5, 29369.5] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [5, 16680.5] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 2048] - - [19, 22463.2] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 2048] - - [16, 25764.7] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 2048] - - [5, 27882.2] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 2048] - - [5, 29294.8] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 2048] - - [16, 30521.8] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [19, 18251.5] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 2560] - - [16, 19232.6] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 2560] - - [16, 27498.9] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 2560] - - [19, 29866.3] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 2560] - - [19, 30820.6] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 2560] - - [5, 32075.9] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [22, 15962.5] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 3072] - - [16, 25657.7] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 3072] - - [16, 28584.3] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 3072] - - [5, 30594.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 3072] - - [1, 30950.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 3072] - - [16, 32420.7] - - - [512, 512, 1, 1024, 512, 512, 512, 512] - - [10, 10387.0] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [19, 17022.2] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 512] - - [5, 20059.7] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 512] - - [19, 22638.7] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 512] - - [20, 23365.2] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 512] - - [5, 26126.8] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [0, 13164.4] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 23349.1] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [16, 26046.3] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [16, 28018.9] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [5, 29343.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [19, 30870.5] - - - [512, 1536, 1, 1024, 512, 512, 512, 1536] - - [19, 19795.4] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [19, 25787.3] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1536] - - [16, 24877.5] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1536] - - [1, 30148.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1536] - - [19, 32041.5] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1536] - - [5, 32160.4] - - - [512, 2048, 1, 1024, 512, 512, 512, 2048] - - [19, 23000.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [19, 27958.6] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 2048] - - [20, 30353.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 2048] - - [16, 32910.4] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 2048] - - [1, 33207.0] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 2048] - - [1, 34410.8] - - - [512, 2560, 1, 1024, 512, 512, 512, 2560] - - [19, 19293.7] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [20, 28586.9] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 2560] - - [5, 31516.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 2560] - - [5, 32695.5] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 2560] - - [16, 33584.5] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 2560] - - [5, 34444.3] - - - [512, 3072, 1, 1024, 512, 512, 512, 3072] - - [5, 26250.5] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [16, 30288.9] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 3072] - - [19, 32653.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 3072] - - [19, 34049.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 3072] - - [16, 34721.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 3072] - - [5, 34981.8] - - - [512, 512, 1, 2048, 512, 512, 512, 512] - - [11, 15358.5] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 512] - - [1, 23129.8] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 512] - - [19, 24494.7] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [5, 27958.6] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 512] - - [16, 28495.7] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 512] - - [19, 29965.7] - - - [512, 1024, 1, 2048, 512, 512, 512, 1024] - - [19, 22924.3] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 1024] - - [16, 27329.3] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 1024] - - [19, 30559.4] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [19, 31352.9] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 1024] - - [16, 32903.2] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 1024] - - [19, 34230.3] - - - [512, 1536, 1, 2048, 512, 512, 512, 1536] - - [19, 24394.2] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 1536] - - [5, 29957.4] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 1536] - - [5, 32132.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [5, 32967.6] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 1536] - - [19, 34108.8] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 1536] - - [5, 35303.1] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [19, 27543.7] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 31852.7] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [5, 33429.5] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [5, 35261.2] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35012.8] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [5, 34927.5] - - - [512, 2560, 1, 2048, 512, 512, 512, 2560] - - [19, 28473.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2560] - - [16, 32443.1] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2560] - - [19, 33703.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [19, 34952.8] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2560] - - [19, 35427.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2560] - - [19, 35568.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 3072] - - [1, 30643.9] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 3072] - - [5, 33135.0] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 3072] - - [19, 35086.0] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [16, 35286.5] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 3072] - - [5, 35540.8] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 3072] - - [19, 35994.4] - - - [512, 512, 1, 3072, 512, 512, 512, 512] - - [10, 17907.8] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 512] - - [16, 25523.4] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 512] - - [19, 26395.7] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 512] - - [16, 29236.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 512] - - [5, 30260.9] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [19, 31219.3] - - - [512, 1024, 1, 3072, 512, 512, 512, 1024] - - [3, 24676.9] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 1024] - - [19, 29555.2] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 1024] - - [5, 31553.2] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 1024] - - [5, 32571.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 1024] - - [16, 33635.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [1, 34003.7] - - - [512, 1536, 1, 3072, 512, 512, 512, 1536] - - [5, 26965.3] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 1536] - - [16, 31697.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 1536] - - [19, 33471.2] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 1536] - - [19, 34142.1] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 1536] - - [5, 34520.7] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [1, 35005.5] - - - [512, 2048, 1, 3072, 512, 512, 512, 2048] - - [1, 30031.5] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 2048] - - [19, 33287.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 2048] - - [16, 34538.9] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 2048] - - [5, 34890.2] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 2048] - - [19, 35540.9] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [19, 35493.7] - - - [512, 2560, 1, 3072, 512, 512, 512, 2560] - - [19, 30412.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 2560] - - [16, 34182.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 2560] - - [5, 34612.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 2560] - - [5, 34945.9] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 2560] - - [19, 35803.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [1, 35815.1] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [1, 32031.1] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 33794.0] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [1, 34947.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 35799.6] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [1, 35674.7] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 36047.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.37979e-05] - - - [1, 1, 1, 64, 1, 1, 1, 1] - - [15, 0.004] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [13, 0.00423883] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [13, 0.00425858] - - - [64, 64, 1, 1, 64, 64, 64, 64] - - [9, 0.269323] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [12, 0.267914] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [15, 0.266945] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [14, 17.0789] - - - [64, 64, 1, 256, 64, 64, 64, 64] - - [14, 62.8228] - - - [64, 64, 1, 512, 64, 64, 64, 64] - - [21, 117.235] - - - [64, 64, 1, 1024, 64, 64, 64, 64] - - [9, 196.083] - - - [64, 64, 1, 2048, 64, 64, 64, 64] - - [21, 307.602] - - - [64, 64, 1, 4096, 64, 64, 64, 64] - - [9, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index d136ffeb70f5..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,7273 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS0_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25088 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB512_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 4224 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 4224 - LdsOffsetB_Blk: 12416 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4224 - LdsOffsetMetadata_Blk: 12416 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 16 - LVCB: 32 - LVPA: 8 - LVPB: 4 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 9344 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 9344 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA256_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB128_MIWT1_1_NLCA1_PGR2_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 1024 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 8448 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 8448 - LdsOffsetB_Blk: 24832 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 8448 - LdsOffsetMetadata_Blk: 24832 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA1024_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 4352 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 4352 - LdsOffsetB_Blk: 20736 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4352 - LdsOffsetMetadata_Blk: 20736 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB128_MIWT2_1_NLCA1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 512, 64] - - [16, 1102.78] - - - [1024, 512, 1, 64, 1024, 1024, 1024, 64] - - [6, 2110.34] - - - [1536, 512, 1, 64, 1536, 1536, 1536, 64] - - [1, 2716.22] - - - [2048, 512, 1, 64, 2048, 2048, 2048, 64] - - [2, 3824.3] - - - [2560, 512, 1, 64, 2560, 2560, 2560, 64] - - [3, 4454.8] - - - [3072, 512, 1, 64, 3072, 3072, 3072, 64] - - [17, 4254.67] - - - [512, 1024, 1, 64, 512, 512, 512, 64] - - [14, 2122.35] - - - [1024, 1024, 1, 64, 1024, 1024, 1024, 64] - - [20, 3584.78] - - - [1536, 1024, 1, 64, 1536, 1536, 1536, 64] - - [3, 5123.6] - - - [2048, 1024, 1, 64, 2048, 2048, 2048, 64] - - [3, 6276.11] - - - [2560, 1024, 1, 64, 2560, 2560, 2560, 64] - - [6, 6942.63] - - - [3072, 1024, 1, 64, 3072, 3072, 3072, 64] - - [7, 7808.35] - - - [512, 1536, 1, 64, 512, 512, 512, 64] - - [4, 2789.38] - - - [1024, 1536, 1, 64, 1024, 1024, 1024, 64] - - [3, 5102.82] - - - [1536, 1536, 1, 64, 1536, 1536, 1536, 64] - - [2, 6741.75] - - - [2048, 1536, 1, 64, 2048, 2048, 2048, 64] - - [3, 8102.81] - - - [2560, 1536, 1, 64, 2560, 2560, 2560, 64] - - [2, 9207.63] - - - [3072, 1536, 1, 64, 3072, 3072, 3072, 64] - - [3, 10299.8] - - - [512, 2048, 1, 64, 512, 512, 512, 64] - - [7, 3766.25] - - - [1024, 2048, 1, 64, 1024, 1024, 1024, 64] - - [2, 6254.18] - - - [1536, 2048, 1, 64, 1536, 1536, 1536, 64] - - [3, 7919.07] - - - [2048, 2048, 1, 64, 2048, 2048, 2048, 64] - - [2, 9321.0] - - - [2560, 2048, 1, 64, 2560, 2560, 2560, 64] - - [3, 10635.7] - - - [3072, 2048, 1, 64, 3072, 3072, 3072, 64] - - [3, 12069.0] - - - [512, 2560, 1, 64, 512, 512, 512, 64] - - [3, 4218.03] - - - [1024, 2560, 1, 64, 1024, 1024, 1024, 64] - - [2, 7070.04] - - - [1536, 2560, 1, 64, 1536, 1536, 1536, 64] - - [6, 9325.68] - - - [2048, 2560, 1, 64, 2048, 2048, 2048, 64] - - [3, 11009.6] - - - [2560, 2560, 1, 64, 2560, 2560, 2560, 64] - - [3, 12362.0] - - - [3072, 2560, 1, 64, 3072, 3072, 3072, 64] - - [7, 13337.8] - - - [512, 3072, 1, 64, 512, 512, 512, 64] - - [6, 5155.08] - - - [1024, 3072, 1, 64, 1024, 1024, 1024, 64] - - [7, 7920.79] - - - [1536, 3072, 1, 64, 1536, 1536, 1536, 64] - - [6, 10022.4] - - - [2048, 3072, 1, 64, 2048, 2048, 2048, 64] - - [3, 12052.6] - - - [2560, 3072, 1, 64, 2560, 2560, 2560, 64] - - [3, 13362.6] - - - [3072, 3072, 1, 64, 3072, 3072, 3072, 64] - - [2, 14403.3] - - - [512, 512, 1, 256, 512, 512, 512, 256] - - [0, 3747.32] - - - [1024, 512, 1, 256, 1024, 1024, 1024, 256] - - [6, 7243.46] - - - [1536, 512, 1, 256, 1536, 1536, 1536, 256] - - [8, 7307.28] - - - [2048, 512, 1, 256, 2048, 2048, 2048, 256] - - [2, 11509.0] - - - [2560, 512, 1, 256, 2560, 2560, 2560, 256] - - [2, 12441.2] - - - [3072, 512, 1, 256, 3072, 3072, 3072, 256] - - [14, 14099.2] - - - [512, 1024, 1, 256, 512, 512, 512, 256] - - [14, 7025.08] - - - [1024, 1024, 1, 256, 1024, 1024, 1024, 256] - - [6, 11538.9] - - - [1536, 1024, 1, 256, 1536, 1536, 1536, 256] - - [6, 14617.0] - - - [2048, 1024, 1, 256, 2048, 2048, 2048, 256] - - [2, 16680.5] - - - [2560, 1024, 1, 256, 2560, 2560, 2560, 256] - - [6, 18354.1] - - - [3072, 1024, 1, 256, 3072, 3072, 3072, 256] - - [2, 20047.2] - - - [512, 1536, 1, 256, 512, 512, 512, 256] - - [2, 9348.59] - - - [1024, 1536, 1, 256, 1024, 1024, 1024, 256] - - [6, 14485.5] - - - [1536, 1536, 1, 256, 1536, 1536, 1536, 256] - - [6, 17521.9] - - - [2048, 1536, 1, 256, 2048, 2048, 2048, 256] - - [2, 20114.8] - - - [2560, 1536, 1, 256, 2560, 2560, 2560, 256] - - [6, 22041.2] - - - [3072, 1536, 1, 256, 3072, 3072, 3072, 256] - - [2, 23013.8] - - - [512, 2048, 1, 256, 512, 512, 512, 256] - - [2, 11085.3] - - - [1024, 2048, 1, 256, 1024, 1024, 1024, 256] - - [2, 16850.9] - - - [1536, 2048, 1, 256, 1536, 1536, 1536, 256] - - [2, 20295.0] - - - [2048, 2048, 1, 256, 2048, 2048, 2048, 256] - - [2, 22703.6] - - - [2560, 2048, 1, 256, 2560, 2560, 2560, 256] - - [2, 24267.1] - - - [3072, 2048, 1, 256, 3072, 3072, 3072, 256] - - [6, 26107.7] - - - [512, 2560, 1, 256, 512, 512, 512, 256] - - [2, 12965.9] - - - [1024, 2560, 1, 256, 1024, 1024, 1024, 256] - - [2, 18366.6] - - - [1536, 2560, 1, 256, 1536, 1536, 1536, 256] - - [2, 22004.8] - - - [2048, 2560, 1, 256, 2048, 2048, 2048, 256] - - [2, 24317.7] - - - [2560, 2560, 1, 256, 2560, 2560, 2560, 256] - - [2, 25695.5] - - - [3072, 2560, 1, 256, 3072, 3072, 3072, 256] - - [2, 26936.9] - - - [512, 3072, 1, 256, 512, 512, 512, 256] - - [14, 14423.2] - - - [1024, 3072, 1, 256, 1024, 1024, 1024, 256] - - [6, 20027.0] - - - [1536, 3072, 1, 256, 1536, 1536, 1536, 256] - - [2, 23488.8] - - - [2048, 3072, 1, 256, 2048, 2048, 2048, 256] - - [19, 25372.4] - - - [2560, 3072, 1, 256, 2560, 2560, 2560, 256] - - [2, 26893.4] - - - [3072, 3072, 1, 256, 3072, 3072, 3072, 256] - - [6, 28682.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [11, 6659.94] - - - [1024, 512, 1, 512, 1024, 1024, 1024, 512] - - [14, 11566.3] - - - [1536, 512, 1, 512, 1536, 1536, 1536, 512] - - [6, 14604.0] - - - [2048, 512, 1, 512, 2048, 2048, 2048, 512] - - [14, 17389.7] - - - [2560, 512, 1, 512, 2560, 2560, 2560, 512] - - [6, 19021.2] - - - [3072, 512, 1, 512, 3072, 3072, 3072, 512] - - [19, 20904.8] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [14, 11964.0] - - - [1024, 1024, 1, 512, 1024, 1024, 1024, 512] - - [6, 17177.7] - - - [1536, 1024, 1, 512, 1536, 1536, 1536, 512] - - [2, 20896.7] - - - [2048, 1024, 1, 512, 2048, 2048, 2048, 512] - - [2, 17762.9] - - - [2560, 1024, 1, 512, 2560, 2560, 2560, 512] - - [19, 25167.4] - - - [3072, 1024, 1, 512, 3072, 3072, 3072, 512] - - [6, 26659.8] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [2, 14577.3] - - - [1024, 1536, 1, 512, 1024, 1024, 1024, 512] - - [25, 15994.3] - - - [1536, 1536, 1, 512, 1536, 1536, 1536, 512] - - [14, 23856.2] - - - [2048, 1536, 1, 512, 2048, 2048, 2048, 512] - - [14, 26373.2] - - - [2560, 1536, 1, 512, 2560, 2560, 2560, 512] - - [6, 27608.5] - - - [3072, 1536, 1, 512, 3072, 3072, 3072, 512] - - [2, 29444.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [6, 17566.3] - - - [1024, 2048, 1, 512, 1024, 1024, 1024, 512] - - [2, 22682.0] - - - [1536, 2048, 1, 512, 1536, 1536, 1536, 512] - - [14, 26297.9] - - - [2048, 2048, 1, 512, 2048, 2048, 2048, 512] - - [2, 28275.7] - - - [2560, 2048, 1, 512, 2560, 2560, 2560, 512] - - [2, 29643.4] - - - [3072, 2048, 1, 512, 3072, 3072, 3072, 512] - - [2, 31025.5] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [14, 18754.9] - - - [1024, 2560, 1, 512, 1024, 1024, 1024, 512] - - [2, 24558.2] - - - [1536, 2560, 1, 512, 1536, 1536, 1536, 512] - - [6, 27629.3] - - - [2048, 2560, 1, 512, 2048, 2048, 2048, 512] - - [6, 29677.8] - - - [2560, 2560, 1, 512, 2560, 2560, 2560, 512] - - [6, 31297.4] - - - [3072, 2560, 1, 512, 3072, 3072, 3072, 512] - - [6, 32040.3] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [2, 20975.6] - - - [1024, 3072, 1, 512, 1024, 1024, 1024, 512] - - [2, 26490.8] - - - [1536, 3072, 1, 512, 1536, 1536, 1536, 512] - - [14, 29040.5] - - - [2048, 3072, 1, 512, 2048, 2048, 2048, 512] - - [6, 31019.4] - - - [2560, 3072, 1, 512, 2560, 2560, 2560, 512] - - [2, 31626.8] - - - [3072, 3072, 1, 512, 3072, 3072, 3072, 512] - - [2, 33263.9] - - - [512, 512, 1, 1024, 512, 512, 512, 1024] - - [11, 10976.3] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 17862.1] - - - [1536, 512, 1, 1024, 1536, 1536, 1536, 1024] - - [24, 17439.6] - - - [2048, 512, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 23430.8] - - - [2560, 512, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 24764.8] - - - [3072, 512, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 26835.5] - - - [512, 1024, 1, 1024, 512, 512, 512, 1024] - - [19, 18085.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 23671.6] - - - [1536, 1024, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 26453.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 28648.2] - - - [2560, 1024, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 29639.9] - - - [3072, 1024, 1, 1024, 3072, 3072, 3072, 1024] - - [6, 30994.0] - - - [512, 1536, 1, 1024, 512, 512, 512, 1024] - - [19, 20362.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [19, 26651.0] - - - [1536, 1536, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 28667.6] - - - [2048, 1536, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 31269.9] - - - [2560, 1536, 1, 1024, 2560, 2560, 2560, 1024] - - [19, 31786.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 32638.1] - - - [512, 2048, 1, 1024, 512, 512, 512, 1024] - - [19, 23477.0] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 28711.4] - - - [1536, 2048, 1, 1024, 1536, 1536, 1536, 1024] - - [2, 31338.4] - - - [2048, 2048, 1, 1024, 2048, 2048, 2048, 1024] - - [2, 32280.9] - - - [2560, 2048, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 33493.4] - - - [3072, 2048, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 33769.3] - - - [512, 2560, 1, 1024, 512, 512, 512, 1024] - - [6, 24833.8] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [2, 29926.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1536, 1024] - - [6, 31938.4] - - - [2048, 2560, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 33211.2] - - - [2560, 2560, 1, 1024, 2560, 2560, 2560, 1024] - - [2, 34754.8] - - - [3072, 2560, 1, 1024, 3072, 3072, 3072, 1024] - - [14, 35387.6] - - - [512, 3072, 1, 1024, 512, 512, 512, 1024] - - [14, 26864.8] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [14, 31058.4] - - - [1536, 3072, 1, 1024, 1536, 1536, 1536, 1024] - - [14, 32940.1] - - - [2048, 3072, 1, 1024, 2048, 2048, 2048, 1024] - - [14, 34567.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 2560, 1024] - - [14, 34955.1] - - - [3072, 3072, 1, 1024, 3072, 3072, 3072, 1024] - - [2, 35700.4] - - - [512, 512, 1, 2048, 512, 512, 512, 2048] - - [12, 15600.1] - - - [1024, 512, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 23765.9] - - - [1536, 512, 1, 2048, 1536, 1536, 1536, 2048] - - [19, 20093.5] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 27942.4] - - - [2560, 512, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 28709.5] - - - [3072, 512, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 30761.4] - - - [512, 1024, 1, 2048, 512, 512, 512, 2048] - - [6, 23871.8] - - - [1024, 1024, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 28220.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 30778.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 32535.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 33459.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34043.7] - - - [512, 1536, 1, 2048, 512, 512, 512, 2048] - - [14, 24983.7] - - - [1024, 1536, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 30956.8] - - - [1536, 1536, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 32333.8] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 33769.3] - - - [2560, 1536, 1, 2048, 2560, 2560, 2560, 2048] - - [14, 34579.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 34607.7] - - - [512, 2048, 1, 2048, 512, 512, 512, 2048] - - [15, 27721.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 32373.6] - - - [1536, 2048, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34114.2] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [2, 34644.4] - - - [2560, 2048, 1, 2048, 2560, 2560, 2560, 2048] - - [2, 35409.6] - - - [3072, 2048, 1, 2048, 3072, 3072, 3072, 2048] - - [19, 36021.8] - - - [512, 2560, 1, 2048, 512, 512, 512, 2048] - - [2, 28845.4] - - - [1024, 2560, 1, 2048, 1024, 1024, 1024, 2048] - - [2, 33035.2] - - - [1536, 2560, 1, 2048, 1536, 1536, 1536, 2048] - - [2, 34694.1] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35699.2] - - - [2560, 2560, 1, 2048, 2560, 2560, 2560, 2048] - - [19, 35409.2] - - - [3072, 2560, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 35717.4] - - - [512, 3072, 1, 2048, 512, 512, 512, 2048] - - [2, 30754.1] - - - [1024, 3072, 1, 2048, 1024, 1024, 1024, 2048] - - [19, 34233.1] - - - [1536, 3072, 1, 2048, 1536, 1536, 1536, 2048] - - [14, 35127.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [19, 35648.4] - - - [2560, 3072, 1, 2048, 2560, 2560, 2560, 2048] - - [6, 35884.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 3072, 2048] - - [6, 37382.1] - - - [512, 512, 1, 3072, 512, 512, 512, 3072] - - [12, 18127.9] - - - [1024, 512, 1, 3072, 1024, 1024, 1024, 3072] - - [19, 26442.7] - - - [1536, 512, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 26322.2] - - - [2048, 512, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 30968.7] - - - [2560, 512, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 30018.1] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 32424.1] - - - [512, 1024, 1, 3072, 512, 512, 512, 3072] - - [2, 27073.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 30600.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 32482.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 2048, 3072] - - [19, 33371.5] - - - [2560, 1024, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 34360.5] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 34915.3] - - - [512, 1536, 1, 3072, 512, 512, 512, 3072] - - [2, 27553.8] - - - [1024, 1536, 1, 3072, 1024, 1024, 1024, 3072] - - [2, 32703.2] - - - [1536, 1536, 1, 3072, 1536, 1536, 1536, 3072] - - [6, 33662.8] - - - [2048, 1536, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 34536.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 35147.2] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [19, 35200.9] - - - [512, 2048, 1, 3072, 512, 512, 512, 3072] - - [2, 31780.0] - - - [1024, 2048, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 33546.5] - - - [1536, 2048, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 34553.2] - - - [2048, 2048, 1, 3072, 2048, 2048, 2048, 3072] - - [14, 35790.3] - - - [2560, 2048, 1, 3072, 2560, 2560, 2560, 3072] - - [2, 36033.7] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 36455.0] - - - [512, 2560, 1, 3072, 512, 512, 512, 3072] - - [6, 31027.4] - - - [1024, 2560, 1, 3072, 1024, 1024, 1024, 3072] - - [6, 35006.1] - - - [1536, 2560, 1, 3072, 1536, 1536, 1536, 3072] - - [19, 35047.6] - - - [2048, 2560, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 36074.1] - - - [2560, 2560, 1, 3072, 2560, 2560, 2560, 3072] - - [14, 36488.0] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [14, 36276.3] - - - [512, 3072, 1, 3072, 512, 512, 512, 3072] - - [2, 33344.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 1024, 3072] - - [14, 35036.4] - - - [1536, 3072, 1, 3072, 1536, 1536, 1536, 3072] - - [14, 36277.7] - - - [2048, 3072, 1, 3072, 2048, 2048, 2048, 3072] - - [2, 35989.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 2560, 3072] - - [6, 36792.8] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [2, 37505.2] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [13, 6.53659e-05] - - - [1, 1, 1, 64, 1, 1, 1, 64] - - [23, 0.00415476] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [5, 0.00401506] - - - [64, 1, 1, 1, 64, 64, 64, 1] - - [12, 0.00426141] - - - [64, 64, 1, 1, 64, 64, 64, 1] - - [18, 0.268344] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [22, 0.267642] - - - [1, 64, 1, 64, 1, 1, 1, 64] - - [22, 0.266519] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [9, 16.5652] - - - [64, 64, 1, 256, 64, 64, 64, 256] - - [10, 65.3522] - - - [64, 64, 1, 512, 64, 64, 64, 512] - - [22, 116.908] - - - [64, 64, 1, 1024, 64, 64, 64, 1024] - - [22, 202.047] - - - [64, 64, 1, 2048, 64, 64, 64, 2048] - - [22, 318.541] - - - [64, 64, 1, 4096, 64, 64, 64, 4096] - - [21, 435.687] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 9146dde94c83..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,8050 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS0_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 8 - LVCA: 32 - LVCB: 16 - LVPA: 4 - LVPB: 8 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14464 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB256_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 12928 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15488 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2176 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14976 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS0_WG16_8_1 - LSCA: 32 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 32 - LVCB: 64 - LVPA: 4 - LVPB: 2 - LdsBlockSizePerPadA: 512 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 2176 - LdsNumElementsAlignedB: 4224 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2176 - LdsOffsetB_Blk: 10368 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2176 - LdsOffsetMetadata_Blk: 10368 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA512_LBSPPB1024_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS0_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 25856 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_LDSB0_CLR0_GSU1_LBSPPA128_LBSPPB512_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 64 - LVCB: 32 - LVPA: 2 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 512 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 30976 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 4352 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB512_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 27392 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 18944 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 18944 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 1024 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 29952 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 8448 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_LDSB0_CLR1_GSU1_LBSPPA128_LBSPPB1024_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 512] - - [4, 1083.17] - - - [1024, 512, 1, 64, 1024, 1024, 64, 512] - - [6, 2149.62] - - - [1536, 512, 1, 64, 1536, 1536, 64, 512] - - [3, 2862.44] - - - [2048, 512, 1, 64, 2048, 2048, 64, 512] - - [1, 3757.71] - - - [2560, 512, 1, 64, 2560, 2560, 64, 512] - - [9, 4330.39] - - - [3072, 512, 1, 64, 3072, 3072, 64, 512] - - [1, 5161.69] - - - [512, 1024, 1, 64, 512, 512, 64, 1024] - - [19, 2075.04] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 1024] - - [20, 3608.97] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 1024] - - [23, 4639.72] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 1024] - - [1, 6143.81] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 1024] - - [1, 7283.99] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 1024] - - [6, 7999.47] - - - [512, 1536, 1, 64, 512, 512, 64, 1536] - - [6, 2970.91] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 1536] - - [8, 5152.58] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 1536] - - [6, 6800.96] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 1536] - - [8, 8088.0] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 1536] - - [2, 9133.94] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 1536] - - [6, 10264.6] - - - [512, 2048, 1, 64, 512, 512, 64, 2048] - - [19, 3696.54] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 2048] - - [1, 6298.34] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 2048] - - [19, 7790.22] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 2048] - - [7, 9700.44] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 2048] - - [9, 10984.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 2048] - - [9, 12022.0] - - - [512, 2560, 1, 64, 512, 512, 64, 2560] - - [20, 4391.71] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 2560] - - [6, 7101.62] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 2560] - - [1, 9360.37] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 2560] - - [1, 11006.0] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 2560] - - [8, 12365.6] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 2560] - - [7, 13479.1] - - - [512, 3072, 1, 64, 512, 512, 64, 3072] - - [0, 4434.99] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 3072] - - [6, 8175.2] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 3072] - - [6, 10493.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 3072] - - [7, 11785.7] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 3072] - - [7, 13452.3] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 3072] - - [6, 14621.4] - - - [512, 512, 1, 256, 512, 512, 256, 512] - - [13, 3950.6] - - - [1024, 512, 1, 256, 1024, 1024, 256, 512] - - [21, 7257.17] - - - [1536, 512, 1, 256, 1536, 1536, 256, 512] - - [5, 7936.24] - - - [2048, 512, 1, 256, 2048, 2048, 256, 512] - - [1, 11596.0] - - - [2560, 512, 1, 256, 2560, 2560, 256, 512] - - [21, 12681.2] - - - [3072, 512, 1, 256, 3072, 3072, 256, 512] - - [22, 13454.7] - - - [512, 1024, 1, 256, 512, 512, 256, 1024] - - [27, 5362.92] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 1024] - - [21, 11518.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 1024] - - [6, 14363.5] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 1024] - - [6, 16690.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 1024] - - [1, 18860.1] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 1024] - - [6, 19744.2] - - - [512, 1536, 1, 256, 512, 512, 256, 1536] - - [19, 9025.27] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 1536] - - [1, 14474.8] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 1536] - - [6, 17097.1] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 1536] - - [6, 20107.5] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 1536] - - [6, 21992.8] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 1536] - - [6, 23393.3] - - - [512, 2048, 1, 256, 512, 512, 256, 2048] - - [19, 11514.2] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 2048] - - [6, 16835.1] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 2048] - - [6, 20200.8] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 2048] - - [6, 21977.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 2048] - - [6, 24251.5] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 2048] - - [6, 25706.9] - - - [512, 2560, 1, 256, 512, 512, 256, 2560] - - [6, 8237.86] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 2560] - - [6, 18794.3] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 2560] - - [6, 21764.1] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 2560] - - [6, 23881.9] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 2560] - - [21, 25509.3] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 2560] - - [6, 27707.3] - - - [512, 3072, 1, 256, 512, 512, 256, 3072] - - [8, 14498.3] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 3072] - - [6, 19817.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 3072] - - [6, 22961.1] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 3072] - - [6, 25392.4] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 3072] - - [8, 27016.5] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 3072] - - [8, 28086.5] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [26, 6889.5] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [1, 12085.4] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [14, 9874.52] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [6, 17180.4] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [6, 18934.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [1, 21194.2] - - - [512, 1024, 1, 512, 512, 512, 512, 1024] - - [6, 12156.6] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 1024] - - [21, 17796.6] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 1024] - - [1, 21025.2] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 1024] - - [6, 23354.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 1024] - - [21, 24928.3] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 1024] - - [1, 26813.2] - - - [512, 1536, 1, 512, 512, 512, 512, 1536] - - [21, 14516.6] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 1536] - - [6, 21185.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 1536] - - [6, 24108.8] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 1536] - - [6, 26635.3] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 1536] - - [21, 27782.2] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 1536] - - [6, 29862.3] - - - [512, 2048, 1, 512, 512, 512, 512, 2048] - - [19, 12205.1] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 2048] - - [6, 23129.8] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 2048] - - [6, 26490.8] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 2048] - - [6, 28155.0] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 2048] - - [1, 30072.6] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 2048] - - [1, 30680.4] - - - [512, 2560, 1, 512, 512, 512, 512, 2560] - - [6, 18757.3] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 2560] - - [6, 19614.2] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 2560] - - [6, 27576.2] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 2560] - - [1, 30077.5] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 2560] - - [6, 31499.1] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 2560] - - [6, 32458.2] - - - [512, 3072, 1, 512, 512, 512, 512, 3072] - - [21, 21039.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 3072] - - [1, 26199.0] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 3072] - - [6, 29460.8] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 3072] - - [1, 30827.6] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 3072] - - [1, 31993.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 3072] - - [6, 33276.4] - - - [512, 512, 1, 1024, 512, 512, 1024, 512] - - [12, 10971.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 512] - - [21, 18310.7] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 512] - - [1, 19266.4] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 512] - - [19, 18428.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 512] - - [1, 24497.7] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 512] - - [1, 26869.1] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [21, 18273.3] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [6, 24337.7] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [19, 26873.8] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [19, 29022.4] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [19, 30283.1] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [1, 31153.2] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1536] - - [6, 20246.8] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1536] - - [1, 27449.5] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1536] - - [21, 28920.3] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1536] - - [6, 31424.3] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1536] - - [1, 31814.4] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1536] - - [1, 33218.0] - - - [512, 2048, 1, 1024, 512, 512, 1024, 2048] - - [21, 24365.6] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 2048] - - [21, 29038.1] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 2048] - - [1, 31145.5] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 2048] - - [19, 32352.7] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 2048] - - [19, 33927.1] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 2048] - - [1, 33613.0] - - - [512, 2560, 1, 1024, 512, 512, 1024, 2560] - - [6, 24540.4] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 2560] - - [22, 29472.2] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 2560] - - [19, 32023.7] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 2560] - - [6, 33480.9] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 2560] - - [1, 33714.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 2560] - - [21, 34741.4] - - - [512, 3072, 1, 1024, 512, 512, 1024, 3072] - - [1, 27182.6] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 3072] - - [6, 31539.8] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 3072] - - [21, 33705.5] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 3072] - - [19, 34361.0] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 3072] - - [19, 35487.5] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 3072] - - [8, 35326.3] - - - [512, 512, 1, 2048, 512, 512, 2048, 512] - - [28, 15679.9] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 512] - - [21, 24296.6] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 512] - - [21, 24278.2] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 512] - - [6, 29739.2] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 512] - - [6, 28423.0] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 512] - - [21, 31795.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 1024] - - [21, 24437.9] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 1024] - - [6, 29380.4] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 1024] - - [21, 31789.3] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 1024] - - [20, 31191.0] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 1024] - - [21, 33912.0] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 1024] - - [6, 35255.0] - - - [512, 1536, 1, 2048, 512, 512, 2048, 1536] - - [6, 24873.6] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 1536] - - [6, 31882.3] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 1536] - - [21, 31570.7] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 1536] - - [21, 34462.4] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 1536] - - [21, 34598.5] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 1536] - - [6, 35573.9] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [6, 29667.1] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [6, 33209.9] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [6, 34438.3] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [6, 35818.1] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [6, 36040.1] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [6, 36782.1] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2560] - - [22, 27572.0] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2560] - - [1, 33165.8] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2560] - - [6, 34165.4] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2560] - - [6, 35669.4] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2560] - - [21, 35789.5] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2560] - - [6, 36272.2] - - - [512, 3072, 1, 2048, 512, 512, 2048, 3072] - - [19, 31784.6] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 3072] - - [6, 35267.6] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 3072] - - [6, 35707.6] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 3072] - - [21, 35920.7] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 3072] - - [21, 36186.1] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 3072] - - [6, 36613.0] - - - [512, 512, 1, 3072, 512, 512, 3072, 512] - - [28, 18056.6] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 512] - - [6, 27553.0] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 512] - - [19, 26363.9] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 512] - - [6, 32272.8] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 512] - - [19, 30756.5] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 512] - - [21, 33875.9] - - - [512, 1024, 1, 3072, 512, 512, 3072, 1024] - - [21, 27610.0] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 1024] - - [21, 31679.6] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 1024] - - [1, 33794.0] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 1024] - - [6, 34582.0] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 1024] - - [6, 34984.0] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 1024] - - [6, 36031.0] - - - [512, 1536, 1, 3072, 512, 512, 3072, 1536] - - [6, 23558.9] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 1536] - - [1, 33291.3] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 1536] - - [8, 33039.3] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 1536] - - [21, 35348.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 1536] - - [1, 35120.0] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 1536] - - [21, 35699.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 2048] - - [1, 32285.7] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 2048] - - [6, 34666.7] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 2048] - - [6, 35985.3] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 2048] - - [21, 35977.0] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 2048] - - [6, 37045.4] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 2048] - - [1, 36987.5] - - - [512, 2560, 1, 3072, 512, 512, 3072, 2560] - - [19, 29945.3] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 2560] - - [19, 35250.2] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 2560] - - [6, 35889.4] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 2560] - - [21, 36970.0] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 2560] - - [8, 36851.2] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 2560] - - [21, 36784.3] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [6, 33508.8] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [19, 36071.5] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [6, 36754.1] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [6, 37061.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [6, 36768.6] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [6, 37017.4] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [15, 6.66533e-05] - - - [1, 1, 1, 64, 1, 1, 64, 1] - - [15, 0.00412252] - - - [1, 64, 1, 1, 1, 1, 1, 64] - - [16, 0.00426852] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [25, 0.00407916] - - - [64, 64, 1, 1, 64, 64, 1, 64] - - [17, 0.266164] - - - [64, 1, 1, 64, 64, 64, 64, 1] - - [24, 0.271564] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [18, 0.270658] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [11, 17.2707] - - - [64, 64, 1, 256, 64, 64, 256, 64] - - [11, 63.0286] - - - [64, 64, 1, 512, 64, 64, 512, 64] - - [10, 115.644] - - - [64, 64, 1, 1024, 64, 64, 1024, 64] - - [24, 203.124] - - - [64, 64, 1, 2048, 64, 64, 2048, 64] - - [10, 317.817] - - - [64, 64, 1, 4096, 64, 64, 4096, 64] - - [10, 442.484] -- null -- null -- DeviceEfficiency -- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml deleted file mode 100644 index 9d592a746dfe..000000000000 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/GridBased/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_HAS_AuxH_SAV_UserArgs.yaml +++ /dev/null @@ -1,9086 +0,0 @@ -- {MinimumRequiredVersion: 4.33.0} -- gfx1200 -- gfx1200 -- [Device 73f0] -- Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false -- - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 9 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG16_8_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 10 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 11 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SVW8_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 12 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS0_SU32_SUM0_SUS256_SVW8_TLDS1_WG32_4_1_WGM8 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 8 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG64_2_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [4, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 16 - MacroTileA: 64 - MacroTileB: 16 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 13 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x16x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG64_2_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 8 - SubGroup1: 16 - SubGroupA: 8 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [64, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 14 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 15 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 16 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 17 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 18 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 19 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 13312 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 13312 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 8 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 20 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 21 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 14592 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 9472 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 9472 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 22 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT16x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 23 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 24 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 25 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 4 - LVCA: 32 - LVCB: 32 - LVPA: 4 - LVPB: 4 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 15872 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 8 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 26 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x32_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR2_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 27 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 26624 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 32 - MacroTileA: 32 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 28 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x32x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT1_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 29 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR1_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG32_4_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 10240 - LdsNumElementsAlignedB: 5120 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 10240 - LdsOffsetB_Blk: 26624 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 10240 - LdsOffsetMetadata_Blk: 26624 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [2, 2] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 64 - MacroTile1: 32 - MacroTileA: 64 - MacroTileB: 32 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 32 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 16 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 30 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT64x32x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG32_4_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 4 - SubGroup1: 32 - SubGroupA: 4 - SubGroupB: 32 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [32, 4, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 1 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 3 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 31 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR1_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR3_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BufferLoad: true - BufferStore: true - CUCount: null - ClusterLocalRead: 0 - CodeObjectVersion: default - ConvertAfterDS: false - CustomKernelName: '' - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {SupportCustomStaggerU: true, SupportCustomWGM: true, SupportUserGSU: true, - UseUniversalArgs: true} - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SVW1_TLDS1_WG16_8_1 - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 64 - LVCB: 64 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsInitCVgprs: false - LdsNumBytes: 31744 - LdsNumElementsAlignedA: 5120 - LdsNumElementsAlignedB: 10240 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 16384 - LdsOffsetB: 5120 - LdsOffsetB_Blk: 21504 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 21504 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: false - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 4] - MIWaveTile: [2, 1] - MIWaveTileA: 2 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 32 - MacroTile1: 64 - MacroTileA: 32 - MacroTileB: 64 - MagicDivAlg: 2 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxOccupancy: 40 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 16 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 32 - NumThreads: 128 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0, 4] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: "" - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 32 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HAS_SAV_UserArgs_MT32x64x64_MI16x16x1_SN_CLR0_GSU1_LBSPPA128_LBSPPB128_MIWT2_1_PGR1_PLR0_SS1_SU32_SUM0_SUS256_SVW1_TLDS1_WG16_8_1_WGM8 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSyncOpt: 0 - StoreVectorWidth: 1 - SubGroup0: 2 - SubGroup1: 64 - SubGroupA: 2 - SubGroupB: 64 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 16 - ThreadTile1: 1 - ThreadTileA: 16 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WavefrontSize: 32 - WorkGroup: [16, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 -- [2, 3, 0, 1] -- - - [512, 512, 1, 64, 512, 512, 64, 64] - - [26, 1116.36] - - - [1024, 512, 1, 64, 1024, 1024, 64, 64] - - [24, 2109.68] - - - [1536, 512, 1, 64, 1536, 1536, 64, 64] - - [6, 2920.74] - - - [2048, 512, 1, 64, 2048, 2048, 64, 64] - - [24, 3812.35] - - - [2560, 512, 1, 64, 2560, 2560, 64, 64] - - [9, 4280.56] - - - [3072, 512, 1, 64, 3072, 3072, 64, 64] - - [7, 4977.66] - - - [512, 1024, 1, 64, 512, 512, 64, 64] - - [20, 2015.7] - - - [1024, 1024, 1, 64, 1024, 1024, 64, 64] - - [5, 3695.63] - - - [1536, 1024, 1, 64, 1536, 1536, 64, 64] - - [6, 5176.29] - - - [2048, 1024, 1, 64, 2048, 2048, 64, 64] - - [26, 5979.18] - - - [2560, 1024, 1, 64, 2560, 2560, 64, 64] - - [7, 7139.54] - - - [3072, 1024, 1, 64, 3072, 3072, 64, 64] - - [7, 8016.99] - - - [512, 1536, 1, 64, 512, 512, 64, 64] - - [4, 2992.1] - - - [1024, 1536, 1, 64, 1024, 1024, 64, 64] - - [8, 4945.87] - - - [1536, 1536, 1, 64, 1536, 1536, 64, 64] - - [23, 6493.29] - - - [2048, 1536, 1, 64, 2048, 2048, 64, 64] - - [3, 8039.4] - - - [2560, 1536, 1, 64, 2560, 2560, 64, 64] - - [3, 9563.3] - - - [3072, 1536, 1, 64, 3072, 3072, 64, 64] - - [7, 10179.8] - - - [512, 2048, 1, 64, 512, 512, 64, 64] - - [9, 3660.25] - - - [1024, 2048, 1, 64, 1024, 1024, 64, 64] - - [3, 6320.59] - - - [1536, 2048, 1, 64, 1536, 1536, 64, 64] - - [3, 8217.08] - - - [2048, 2048, 1, 64, 2048, 2048, 64, 64] - - [3, 9524.73] - - - [2560, 2048, 1, 64, 2560, 2560, 64, 64] - - [3, 11184.3] - - - [3072, 2048, 1, 64, 3072, 3072, 64, 64] - - [3, 11892.1] - - - [512, 2560, 1, 64, 512, 512, 64, 64] - - [4, 4524.6] - - - [1024, 2560, 1, 64, 1024, 1024, 64, 64] - - [3, 7363.92] - - - [1536, 2560, 1, 64, 1536, 1536, 64, 64] - - [7, 9451.78] - - - [2048, 2560, 1, 64, 2048, 2048, 64, 64] - - [3, 11067.7] - - - [2560, 2560, 1, 64, 2560, 2560, 64, 64] - - [4, 12345.4] - - - [3072, 2560, 1, 64, 3072, 3072, 64, 64] - - [3, 13638.3] - - - [512, 3072, 1, 64, 512, 512, 64, 64] - - [5, 5016.11] - - - [1024, 3072, 1, 64, 1024, 1024, 64, 64] - - [6, 7863.24] - - - [1536, 3072, 1, 64, 1536, 1536, 64, 64] - - [8, 10419.2] - - - [2048, 3072, 1, 64, 2048, 2048, 64, 64] - - [7, 12251.0] - - - [2560, 3072, 1, 64, 2560, 2560, 64, 64] - - [8, 13673.5] - - - [3072, 3072, 1, 64, 3072, 3072, 64, 64] - - [3, 14721.3] - - - [512, 512, 1, 256, 512, 512, 256, 256] - - [23, 3824.3] - - - [1024, 512, 1, 256, 1024, 1024, 256, 256] - - [21, 6310.04] - - - [1536, 512, 1, 256, 1536, 1536, 256, 256] - - [3, 9379.08] - - - [2048, 512, 1, 256, 2048, 2048, 256, 256] - - [7, 11175.5] - - - [2560, 512, 1, 256, 2560, 2560, 256, 256] - - [3, 12621.6] - - - [3072, 512, 1, 256, 3072, 3072, 256, 256] - - [7, 14480.3] - - - [512, 1024, 1, 256, 512, 512, 256, 256] - - [7, 7320.5] - - - [1024, 1024, 1, 256, 1024, 1024, 256, 256] - - [7, 11689.9] - - - [1536, 1024, 1, 256, 1536, 1536, 256, 256] - - [32, 11671.1] - - - [2048, 1024, 1, 256, 2048, 2048, 256, 256] - - [7, 16930.9] - - - [2560, 1024, 1, 256, 2560, 2560, 256, 256] - - [3, 18384.2] - - - [3072, 1024, 1, 256, 3072, 3072, 256, 256] - - [3, 20444.7] - - - [512, 1536, 1, 256, 512, 512, 256, 256] - - [3, 9420.77] - - - [1024, 1536, 1, 256, 1024, 1024, 256, 256] - - [7, 14561.4] - - - [1536, 1536, 1, 256, 1536, 1536, 256, 256] - - [7, 17782.7] - - - [2048, 1536, 1, 256, 2048, 2048, 256, 256] - - [7, 20398.1] - - - [2560, 1536, 1, 256, 2560, 2560, 256, 256] - - [7, 22130.6] - - - [3072, 1536, 1, 256, 3072, 3072, 256, 256] - - [7, 23282.6] - - - [512, 2048, 1, 256, 512, 512, 256, 256] - - [23, 11511.4] - - - [1024, 2048, 1, 256, 1024, 1024, 256, 256] - - [7, 17019.7] - - - [1536, 2048, 1, 256, 1536, 1536, 256, 256] - - [7, 20531.0] - - - [2048, 2048, 1, 256, 2048, 2048, 256, 256] - - [7, 22318.2] - - - [2560, 2048, 1, 256, 2560, 2560, 256, 256] - - [3, 24464.2] - - - [3072, 2048, 1, 256, 3072, 3072, 256, 256] - - [7, 26127.0] - - - [512, 2560, 1, 256, 512, 512, 256, 256] - - [23, 12787.8] - - - [1024, 2560, 1, 256, 1024, 1024, 256, 256] - - [23, 17516.4] - - - [1536, 2560, 1, 256, 1536, 1536, 256, 256] - - [7, 22099.0] - - - [2048, 2560, 1, 256, 2048, 2048, 256, 256] - - [3, 24493.2] - - - [2560, 2560, 1, 256, 2560, 2560, 256, 256] - - [23, 26039.0] - - - [3072, 2560, 1, 256, 3072, 3072, 256, 256] - - [7, 27133.3] - - - [512, 3072, 1, 256, 512, 512, 256, 256] - - [31, 11673.0] - - - [1024, 3072, 1, 256, 1024, 1024, 256, 256] - - [7, 20336.3] - - - [1536, 3072, 1, 256, 1536, 1536, 256, 256] - - [7, 23677.8] - - - [2048, 3072, 1, 256, 2048, 2048, 256, 256] - - [23, 25872.3] - - - [2560, 3072, 1, 256, 2560, 2560, 256, 256] - - [3, 27085.7] - - - [3072, 3072, 1, 256, 3072, 3072, 256, 256] - - [3, 28148.9] - - - [512, 512, 1, 512, 512, 512, 512, 512] - - [29, 6901.9] - - - [1024, 512, 1, 512, 1024, 1024, 512, 512] - - [22, 10023.5] - - - [1536, 512, 1, 512, 1536, 1536, 512, 512] - - [3, 14241.6] - - - [2048, 512, 1, 512, 2048, 2048, 512, 512] - - [7, 17210.7] - - - [2560, 512, 1, 512, 2560, 2560, 512, 512] - - [7, 18333.8] - - - [3072, 512, 1, 512, 3072, 3072, 512, 512] - - [18, 20924.1] - - - [512, 1024, 1, 512, 512, 512, 512, 512] - - [18, 11163.9] - - - [1024, 1024, 1, 512, 1024, 1024, 512, 512] - - [3, 17257.7] - - - [1536, 1024, 1, 512, 1536, 1536, 512, 512] - - [3, 21191.4] - - - [2048, 1024, 1, 512, 2048, 2048, 512, 512] - - [3, 23518.1] - - - [2560, 1024, 1, 512, 2560, 2560, 512, 512] - - [7, 24712.4] - - - [3072, 1024, 1, 512, 3072, 3072, 512, 512] - - [3, 26455.7] - - - [512, 1536, 1, 512, 512, 512, 512, 512] - - [7, 14173.7] - - - [1024, 1536, 1, 512, 1024, 1024, 512, 512] - - [3, 21272.9] - - - [1536, 1536, 1, 512, 1536, 1536, 512, 512] - - [7, 23413.7] - - - [2048, 1536, 1, 512, 2048, 2048, 512, 512] - - [7, 26629.0] - - - [2560, 1536, 1, 512, 2560, 2560, 512, 512] - - [3, 27606.6] - - - [3072, 1536, 1, 512, 3072, 3072, 512, 512] - - [3, 29315.8] - - - [512, 2048, 1, 512, 512, 512, 512, 512] - - [3, 17921.7] - - - [1024, 2048, 1, 512, 1024, 1024, 512, 512] - - [7, 23634.6] - - - [1536, 2048, 1, 512, 1536, 1536, 512, 512] - - [3, 26315.1] - - - [2048, 2048, 1, 512, 2048, 2048, 512, 512] - - [3, 28469.1] - - - [2560, 2048, 1, 512, 2560, 2560, 512, 512] - - [18, 30072.4] - - - [3072, 2048, 1, 512, 3072, 3072, 512, 512] - - [23, 31016.6] - - - [512, 2560, 1, 512, 512, 512, 512, 512] - - [3, 18699.8] - - - [1024, 2560, 1, 512, 1024, 1024, 512, 512] - - [3, 24838.3] - - - [1536, 2560, 1, 512, 1536, 1536, 512, 512] - - [3, 27568.4] - - - [2048, 2560, 1, 512, 2048, 2048, 512, 512] - - [3, 30484.8] - - - [2560, 2560, 1, 512, 2560, 2560, 512, 512] - - [7, 31541.9] - - - [3072, 2560, 1, 512, 3072, 3072, 512, 512] - - [23, 31976.4] - - - [512, 3072, 1, 512, 512, 512, 512, 512] - - [25, 20206.2] - - - [1024, 3072, 1, 512, 1024, 1024, 512, 512] - - [3, 26139.5] - - - [1536, 3072, 1, 512, 1536, 1536, 512, 512] - - [7, 29248.4] - - - [2048, 3072, 1, 512, 2048, 2048, 512, 512] - - [3, 30635.2] - - - [2560, 3072, 1, 512, 2560, 2560, 512, 512] - - [7, 31928.0] - - - [3072, 3072, 1, 512, 3072, 3072, 512, 512] - - [3, 32401.2] - - - [512, 512, 1, 1024, 512, 512, 1024, 1024] - - [16, 11191.8] - - - [1024, 512, 1, 1024, 1024, 1024, 1024, 1024] - - [30, 15094.6] - - - [1536, 512, 1, 1024, 1536, 1536, 1024, 1024] - - [7, 20115.1] - - - [2048, 512, 1, 1024, 2048, 2048, 1024, 1024] - - [1, 18401.7] - - - [2560, 512, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 24437.0] - - - [3072, 512, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 27036.4] - - - [512, 1024, 1, 1024, 512, 512, 1024, 1024] - - [23, 18759.2] - - - [1024, 1024, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 23930.3] - - - [1536, 1024, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 27311.9] - - - [2048, 1024, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 29234.0] - - - [2560, 1024, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 30887.0] - - - [3072, 1024, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 31271.5] - - - [512, 1536, 1, 1024, 512, 512, 1024, 1024] - - [3, 15196.0] - - - [1024, 1536, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 27468.2] - - - [1536, 1536, 1, 1024, 1536, 1536, 1024, 1024] - - [9, 28538.7] - - - [2048, 1536, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 31260.8] - - - [2560, 1536, 1, 1024, 2560, 2560, 1024, 1024] - - [18, 31617.8] - - - [3072, 1536, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 33265.9] - - - [512, 2048, 1, 1024, 512, 512, 1024, 1024] - - [7, 24437.9] - - - [1024, 2048, 1, 1024, 1024, 1024, 1024, 1024] - - [23, 29042.0] - - - [1536, 2048, 1, 1024, 1536, 1536, 1024, 1024] - - [3, 31309.3] - - - [2048, 2048, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 32573.8] - - - [2560, 2048, 1, 1024, 2560, 2560, 1024, 1024] - - [7, 33352.6] - - - [3072, 2048, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 33978.9] - - - [512, 2560, 1, 1024, 512, 512, 1024, 1024] - - [3, 24450.6] - - - [1024, 2560, 1, 1024, 1024, 1024, 1024, 1024] - - [7, 30538.4] - - - [1536, 2560, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 31676.3] - - - [2048, 2560, 1, 1024, 2048, 2048, 1024, 1024] - - [18, 33474.4] - - - [2560, 2560, 1, 1024, 2560, 2560, 1024, 1024] - - [25, 34230.7] - - - [3072, 2560, 1, 1024, 3072, 3072, 1024, 1024] - - [18, 34591.6] - - - [512, 3072, 1, 1024, 512, 512, 1024, 1024] - - [7, 27123.1] - - - [1024, 3072, 1, 1024, 1024, 1024, 1024, 1024] - - [25, 30137.2] - - - [1536, 3072, 1, 1024, 1536, 1536, 1024, 1024] - - [18, 33263.7] - - - [2048, 3072, 1, 1024, 2048, 2048, 1024, 1024] - - [3, 33801.1] - - - [2560, 3072, 1, 1024, 2560, 2560, 1024, 1024] - - [3, 35197.0] - - - [3072, 3072, 1, 1024, 3072, 3072, 1024, 1024] - - [3, 35739.8] - - - [512, 512, 1, 2048, 512, 512, 2048, 2048] - - [0, 16070.1] - - - [1024, 512, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 24839.9] - - - [1536, 512, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 24644.6] - - - [2048, 512, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 29724.8] - - - [2560, 512, 1, 2048, 2560, 2560, 2048, 2048] - - [7, 28576.3] - - - [3072, 512, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 31765.6] - - - [512, 1024, 1, 2048, 512, 512, 2048, 2048] - - [30, 19501.7] - - - [1024, 1024, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 29849.0] - - - [1536, 1024, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 31696.9] - - - [2048, 1024, 1, 2048, 2048, 2048, 2048, 2048] - - [7, 33441.7] - - - [2560, 1024, 1, 2048, 2560, 2560, 2048, 2048] - - [18, 33752.7] - - - [3072, 1024, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 34739.8] - - - [512, 1536, 1, 2048, 512, 512, 2048, 2048] - - [7, 25223.0] - - - [1024, 1536, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 32174.0] - - - [1536, 1536, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 31646.4] - - - [2048, 1536, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 34694.8] - - - [2560, 1536, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34553.2] - - - [3072, 1536, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35759.2] - - - [512, 2048, 1, 2048, 512, 512, 2048, 2048] - - [3, 29652.8] - - - [1024, 2048, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33297.5] - - - [1536, 2048, 1, 2048, 1536, 1536, 2048, 2048] - - [7, 34608.8] - - - [2048, 2048, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 35139.7] - - - [2560, 2048, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 35341.3] - - - [3072, 2048, 1, 2048, 3072, 3072, 2048, 2048] - - [23, 35956.2] - - - [512, 2560, 1, 2048, 512, 512, 2048, 2048] - - [3, 28258.1] - - - [1024, 2560, 1, 2048, 1024, 1024, 2048, 2048] - - [7, 33681.7] - - - [1536, 2560, 1, 2048, 1536, 1536, 2048, 2048] - - [3, 34214.0] - - - [2048, 2560, 1, 2048, 2048, 2048, 2048, 2048] - - [3, 35566.9] - - - [2560, 2560, 1, 2048, 2560, 2560, 2048, 2048] - - [3, 34975.4] - - - [3072, 2560, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36457.3] - - - [512, 3072, 1, 2048, 512, 512, 2048, 2048] - - [23, 32028.3] - - - [1024, 3072, 1, 2048, 1024, 1024, 2048, 2048] - - [3, 35102.8] - - - [1536, 3072, 1, 2048, 1536, 1536, 2048, 2048] - - [23, 35622.4] - - - [2048, 3072, 1, 2048, 2048, 2048, 2048, 2048] - - [23, 36506.3] - - - [2560, 3072, 1, 2048, 2560, 2560, 2048, 2048] - - [23, 36578.4] - - - [3072, 3072, 1, 2048, 3072, 3072, 2048, 2048] - - [7, 36227.1] - - - [512, 512, 1, 3072, 512, 512, 3072, 3072] - - [16, 19097.1] - - - [1024, 512, 1, 3072, 1024, 1024, 3072, 3072] - - [7, 22485.2] - - - [1536, 512, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 27063.8] - - - [2048, 512, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 31945.5] - - - [2560, 512, 1, 3072, 2560, 2560, 3072, 3072] - - [25, 30201.7] - - - [3072, 512, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 33670.1] - - - [512, 1024, 1, 3072, 512, 512, 3072, 3072] - - [23, 27764.6] - - - [1024, 1024, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 31934.4] - - - [1536, 1024, 1, 3072, 1536, 1536, 3072, 3072] - - [18, 33879.6] - - - [2048, 1024, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 34652.8] - - - [2560, 1024, 1, 3072, 2560, 2560, 3072, 3072] - - [7, 35678.7] - - - [3072, 1024, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36476.6] - - - [512, 1536, 1, 3072, 512, 512, 3072, 3072] - - [19, 26411.6] - - - [1024, 1536, 1, 3072, 1024, 1024, 3072, 3072] - - [18, 33941.6] - - - [1536, 1536, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 33518.4] - - - [2048, 1536, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 35550.4] - - - [2560, 1536, 1, 3072, 2560, 2560, 3072, 3072] - - [19, 35037.3] - - - [3072, 1536, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36761.4] - - - [512, 2048, 1, 3072, 512, 512, 3072, 3072] - - [23, 32025.2] - - - [1024, 2048, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 34942.8] - - - [1536, 2048, 1, 3072, 1536, 1536, 3072, 3072] - - [7, 36511.0] - - - [2048, 2048, 1, 3072, 2048, 2048, 3072, 3072] - - [7, 36512.1] - - - [2560, 2048, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36582.2] - - - [3072, 2048, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 36507.9] - - - [512, 2560, 1, 3072, 512, 512, 3072, 3072] - - [9, 30497.2] - - - [1024, 2560, 1, 3072, 1024, 1024, 3072, 3072] - - [23, 35525.0] - - - [1536, 2560, 1, 3072, 1536, 1536, 3072, 3072] - - [9, 35295.7] - - - [2048, 2560, 1, 3072, 2048, 2048, 3072, 3072] - - [23, 36415.3] - - - [2560, 2560, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36079.3] - - - [3072, 2560, 1, 3072, 3072, 3072, 3072, 3072] - - [7, 36791.5] - - - [512, 3072, 1, 3072, 512, 512, 3072, 3072] - - [23, 33847.3] - - - [1024, 3072, 1, 3072, 1024, 1024, 3072, 3072] - - [3, 35738.6] - - - [1536, 3072, 1, 3072, 1536, 1536, 3072, 3072] - - [23, 36281.0] - - - [2048, 3072, 1, 3072, 2048, 2048, 3072, 3072] - - [3, 36700.7] - - - [2560, 3072, 1, 3072, 2560, 2560, 3072, 3072] - - [3, 36778.3] - - - [3072, 3072, 1, 3072, 3072, 3072, 3072, 3072] - - [3, 37457.0] - - - [1, 1, 1, 1, 1, 1, 1, 1] - - [17, 6.65624e-05] - - - [1, 1, 1, 64, 1, 1, 64, 64] - - [21, 0.00424868] - - - [1, 64, 1, 1, 1, 1, 1, 1] - - [13, 0.00424305] - - - [64, 1, 1, 1, 64, 64, 1, 1] - - [15, 0.00427565] - - - [64, 64, 1, 1, 64, 64, 1, 1] - - [2, 0.269944] - - - [64, 1, 1, 64, 64, 64, 64, 64] - - [10, 0.250505] - - - [1, 64, 1, 64, 1, 1, 64, 64] - - [28, 0.271195] - - - [64, 64, 1, 64, 64, 64, 64, 64] - - [12, 17.4611] - - - [64, 64, 1, 256, 64, 64, 256, 256] - - [11, 64.9052] - - - [64, 64, 1, 512, 64, 64, 512, 512] - - [14, 118.493] - - - [64, 64, 1, 1024, 64, 64, 1024, 1024] - - [11, 203.869] - - - [64, 64, 1, 2048, 64, 64, 2048, 2048] - - [11, 322.713] - - - [64, 64, 1, 4096, 64, 64, 4096, 4096] - - [27, 447.864] -- null -- null -- DeviceEfficiency -- GridBased From ede63ca4e6cde691dccfca3a3f5f4361e2b4e757 Mon Sep 17 00:00:00 2001 From: wencchen Date: Tue, 24 Jun 2025 03:14:31 +0000 Subject: [PATCH 3/5] fix yaml type from gridbase to equality --- ...200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- ...201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 5afc7bd9737c..8b0b13759eeb 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 36b42ed1661e..1c69ca18fd60 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 9484935138d3..380216396ecd 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 25886fcde524..8a14a09357f7 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 5ea3910e819d..2241b2707df3 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 3ccbd40fa0ac..8c76c56dc520 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -1908,4 +1908,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 1941ec3f8558..5bbbe53cced5 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index bae1e6c44bf9..6a09fdb3faab 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -2210,4 +2210,4 @@ - null - null - DeviceEfficiency -- GridBased +- Equality From cf1a0edc5a9e05ab3e0cdee2e0f6c3eba1b90c32 Mon Sep 17 00:00:00 2001 From: Wen-Chuan Chen Date: Tue, 24 Jun 2025 04:33:55 -0400 Subject: [PATCH 4/5] keep only size=128 kernel gfx1201 --- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1888 +---------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1576 +------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1874 +--------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1854 +--------------- 4 files changed, 99 insertions(+), 7093 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 2241b2707df3..64178fba7f50 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,1829 +89,17 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA_gVzqyXY4lVMFx31MqSbB0ud3S5R9EQWi12BfYDWDLk= + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAnKrZLw0Nlt1oQtI32D3BKNxykE_z_QRF1ZHflDCgI_Q= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAIVneIeXsqZD4OaZjJp8hvawKFBg-ime18ufUnc8u8cc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAKI8wQM_FheExAvx0QJ-zMY1craZcEag1K1coINKPPkw= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 8 - LVCA: 2 - LVCB: 4 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAW-f1a6oJ3DMPUYY6SGvmU2bXZUxM9tH-ICrRLM0ejig= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 2 - LVCB: 2 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAuu_CKIJDCq41BZOvU4z4gH484wfIlTBh3jGK8bJ6M-w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserARtOO83QKIFvzwyHltdOqn_fj7DOrstvd0ULWV0Af0fc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA-m9AcsWte2XsapAmRUjnuiJni1_0NG83An71gKU_3Qk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' DebugStreamK: 0 - DepthU: 32 + DepthU: 64 DirectToLds: false DirectToLdsA: false DirectToLdsB: false @@ -1926,7 +114,7 @@ ExpertSchedulingMode: 2 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer @@ -1934,7 +122,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [12, 0, 1] @@ -1944,34 +132,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 16 LSCB: 16 - LSPA: 2 + LSPA: 8 LSPB: 8 - LVCA: 16 + LVCA: 4 LVCB: 4 LVPA: 2 LVPB: 2 LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2304 + LdsBytesNoAmax: 12800 LdsInitCVgprs: false - LdsNumBytes: 2304 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 + LdsNumBytes: 12800 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2304 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 5248 + LdsOffsetMetadata_Blk: 10496 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -1981,8 +169,8 @@ LocalWritePerMfma: -1 LocalWriteUseSgprA: false LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 + LoopIters: 4 + LoopUnroll: 64 MFMA_BF16_1K: 0 MIArchVgpr: true MIBlock: [16, 16, 16, 1, 1, 1] @@ -2029,12 +217,12 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsA: 8 + NumLoadsB: 8 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -2125,9 +313,9 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -2135,7 +323,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -2176,16 +364,16 @@ WorkGroupMappingXCCGroup: -1 WorkGroupReduction: false WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 + _DepthU: 64 + _DepthUA: 64 + _DepthUB: 64 + _DepthUMetadata: 64 _GlobalAccumulation: MultipleBuffer _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 + _staggerStrideShift: 1 enableLDSTrA: false enableLDSTrB: false reorderGRInstForDTVA: false @@ -2194,19 +382,7 @@ tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [2, 0.08] - - - [129, 128, 1, 128, 129, 129, 129, 128] - - [0, 0.07] - - - [128, 129, 1, 128, 128, 128, 128, 129] - - [1, 0.07] - - - [128, 128, 1, 129, 128, 128, 128, 128] - - [3, 0.08] - - - [129, 129, 1, 128, 129, 129, 129, 129] - - [5, 0.07] - - - [129, 128, 1, 129, 129, 129, 129, 128] - - [6, 0.07] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [4, 0.07] + - [0, 0.08] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 8c76c56dc520..eba2ec731d00 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,12 +89,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAssQcDAYsuGnrJ0RCDOfoZlcyUNjLyP5y1_KdtcmJf0E= + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAvPbW1EldmDkYG10epNpkVm-irjGzqxm08jz6SByNeQs= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' @@ -111,18 +111,18 @@ EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 + ExpertSchedulingMode: 2 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthA: 4 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [12, 0, 1] @@ -132,24 +132,24 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 16 LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 16 - LVCB: 8 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 LVPA: 2 LVPB: 1 LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6528 + LdsBytesNoAmax: 6400 LdsInitCVgprs: false - LdsNumBytes: 6528 + LdsNumBytes: 6400 LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1280 + LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 @@ -217,12 +217,12 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsA: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 2 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -230,8 +230,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: 0 ProblemType: Activation: true @@ -311,10 +311,10 @@ UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 + ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -337,1522 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 1 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlwVP7gyExHTNKoV_9-NqHQcY8x8JDQiBg2yf4PAHPvk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 1 - LVCA: 16 - LVCB: 32 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserABPLzPwkl1b26M47S18lTb0flFRVJhwDGzlzvnE61UQE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 8 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6528 - LdsInitCVgprs: false - LdsNumBytes: 6528 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAJwyqJfxSdPQhtNS5fOq2WVNAyT1YsraJu8houvostjU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 8 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAQNGo8mAAKe45Vr7xTXHP-SVujPWAE_eB4_O2SOqbvY0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 1 - LVCA: 16 - LVCB: 32 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA2fX9lfk9GAJdIriBwmfehppLN1D9ED3LkNtP7zSIUlc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -1879,7 +369,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -1892,19 +382,7 @@ tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [3, 0.08] - - - [129, 128, 1, 128, 129, 129, 129, 128] - - [5, 0.07] - - - [128, 129, 1, 128, 128, 128, 128, 128] - - [1, 0.08] - - - [128, 128, 1, 129, 128, 128, 128, 129] - [0, 0.08] - - - [129, 129, 1, 128, 129, 129, 129, 128] - - [4, 0.07] - - - [129, 128, 1, 129, 129, 129, 129, 129] - - [5, 0.07] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [2, 0.08] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 5bbbe53cced5..8e40467e7f4b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -78,7 +78,7 @@ UseScaleAB: '' UseScaleAlphaVec: 1 UseScaleCD: false -- - 1LDSBuffer: 1 +- - 1LDSBuffer: 0 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -89,12 +89,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAXq3Qh7OtsBZWdD5vOEmSIdhiSav2Ren-TgRqSFfMpZc= + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoP3i3lF397797qrIx8MG_jBevK8ik06WqHbC1LQ2U58= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' @@ -115,7 +115,7 @@ ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 1 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -123,7 +123,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: false + GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [12, 0, 1] InnerUnroll: 1 @@ -132,34 +132,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 32 LSCB: 16 LSPA: 8 - LSPB: 8 + LSPB: 2 LVCA: 4 - LVCB: 4 + LVCB: 16 LVPA: 1 LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2304 + LdsBytesNoAmax: 6656 LdsInitCVgprs: false - LdsNumBytes: 2304 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 + LdsNumBytes: 6656 + LdsNumElementsAlignedA: 1280 + LdsNumElementsAlignedB: 1280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 + LdsOffsetB: 1280 + LdsOffsetB_Blk: 5376 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 5248 + LdsOffsetMetadata: 1280 + LdsOffsetMetadata_Blk: 5376 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -218,11 +218,11 @@ NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 2 - NumLoadsB: 4 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 16 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -314,7 +314,7 @@ ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -337,12 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 0 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -369,7 +369,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false + _UseSgprForGRO: 1 _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -380,1833 +380,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAg3Er7LpiQJHElQtk-5iCTSz-wu8ScqhO65gaV9kLXIg= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 2 - LVCA: 8 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2560 - LdsInitCVgprs: false - LdsNumBytes: 2560 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA75TZ_95242teuBaf0bfjFy1tkVPH7kWdr2Kcg4JFgJs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 2 - LVCA: 8 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2432 - LdsInitCVgprs: false - LdsNumBytes: 2432 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2432 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA0zZWwdGXwF0u_qsM45G41-beHeGkGA1Dzf7Lybt3f7I= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 8 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13056 - LdsInitCVgprs: false - LdsNumBytes: 13056 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlMsVS0gVUWkH3AExJlZmmYK6npX79DQmOZOqfQ-reTc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 4 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA7FBspZCHlbfbDKx8oD6wXa6U64Azerl5XZGHYTMC-Gs= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 2 - LVCA: 4 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6528 - LdsInitCVgprs: false - LdsNumBytes: 6528 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAwJvHxKN5tg75Sp_5g7s9RMP3wv4WzIQNjssTW1YSxB0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 8 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [3, 0.08] - - - [129, 128, 1, 128, 129, 129, 128, 128] - - [2, 0.07] - - - [128, 129, 1, 128, 128, 128, 128, 129] - - [1, 0.08] - - - [128, 128, 1, 129, 128, 128, 129, 128] - [0, 0.08] - - - [129, 129, 1, 128, 129, 129, 128, 129] - - [5, 0.07] - - - [129, 128, 1, 129, 129, 129, 129, 128] - - [4, 0.08] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [6, 0.07] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 6a09fdb3faab..213cebe1154e 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1201/Equality/gfx1201_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,1819 +89,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAjS5FvVEezs2lcRsTtVmGR-aPCZ4PyEz4TS9Q5TV3AEU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 1 - LVCA: 8 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAxdekMXyoBnaw8oe3vrLNklpTVCB_DbFyo5mWV4gGEVk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 1 - LSPB: 8 - LVCA: 32 - LVCB: 4 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAlkb4g3hYd2Ug1vQuFNo1sX2ASflzLV2vqjnC-evih3w= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 4 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA6qosFeA-HRbtOduZV-BrtS2p4okMa3p3LMFg2sCKRL8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 4 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAia-sIZol4cC_o1eSiq9SHyL3Zg22Xxh2FGk60q6KEXU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 5120 - LdsInitCVgprs: false - LdsNumBytes: 5120 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 8 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAes6gb3cYte_OgoONp1kpyfxv77n24deIis_dCJB50Ck= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 1] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 1 - LSPB: 4 - LVCA: 32 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2560 - LdsInitCVgprs: false - LdsNumBytes: 2560 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA7p8-kkOxKWtIwdWAfbGndlz2r4n7dzs4_xZweOgKk34= + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAaABph6osUKON-kGZ0d9XYIH3_vXdZUYgjn6Qfg8R2qw= BufferLoad: true BufferStore: true CUCount: null @@ -1926,7 +114,7 @@ ExpertSchedulingMode: 2 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer @@ -1944,13 +132,13 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 64 LSCB: 64 - LSPA: 4 + LSPA: 2 LSPB: 4 - LVCA: 8 + LVCA: 16 LVCB: 8 LVPA: 1 LVPB: 1 @@ -2029,11 +217,11 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 + NumLoadsA: 8 NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularA: 8 NumLoadsPerpendicularB: 4 NumThreads: 32 NumWaveSplitK: 1 @@ -2042,7 +230,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 + PrefetchGlobalRead: 1 PrefetchLocalRead: 0 PreloadKernArgs: 0 ProblemType: @@ -2125,9 +313,9 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 3 ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1201_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -2135,7 +323,7 @@ StoreRemapVectorWidth: 0 StoreSwapAddr: false StoreSyncOpt: 0 - StoreVectorWidth: 1 + StoreVectorWidth: 8 StreamK: 0 StreamKAtomic: 0 StreamKXCCMapping: 0 @@ -2149,12 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: true + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -2194,19 +382,7 @@ tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [2, 0.08] - - - [129, 128, 1, 128, 129, 129, 128, 128] - [0, 0.08] - - - [128, 129, 1, 128, 128, 128, 128, 128] - - [3, 0.08] - - - [128, 128, 1, 129, 128, 128, 129, 129] - - [4, 0.08] - - - [129, 129, 1, 128, 129, 129, 128, 128] - - [1, 0.07] - - - [129, 128, 1, 129, 129, 129, 129, 129] - - [5, 0.08] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [6, 0.07] - null - null - DeviceEfficiency From 910fbbef1e83e655d58a9d2acbdadca134bb515c Mon Sep 17 00:00:00 2001 From: wencchen Date: Tue, 24 Jun 2025 08:35:58 +0000 Subject: [PATCH 5/5] keep only size=128 kernsl gfx1200 --- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1854 +--------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1874 +--------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1882 +---------------- ..._HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml | 1860 +--------------- 4 files changed, 87 insertions(+), 7383 deletions(-) diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 8b0b13759eeb..ba5fcf8590a7 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,12 +89,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA5V5zNUlGs0XPWItJNtxIjbzEDM2o5rgUbaeh6pvDflg= + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAKgPe_Wv9DOQxiTMf0uqv29biFXQUfeD7e0IgqktbPPA= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 1 + ClusterLocalRead: 0 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' @@ -111,10 +111,10 @@ EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 + ExpertSchedulingMode: 2 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 + GlobalReadVectorWidthA: 4 GlobalReadVectorWidthB: 1 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer @@ -122,7 +122,7 @@ GlobalSplitUWorkGroupMappingRoundRobin: false GlobalWriteVectorWidth: 1 GroupLoadStore: false - GuaranteeNoPartialA: true + GuaranteeNoPartialA: false GuaranteeNoPartialB: true GuaranteeNoPartialMetadata: true ISA: [12, 0, 0] @@ -132,13 +132,13 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 16 LSCB: 16 - LSPA: 2 + LSPA: 8 LSPB: 2 - LVCA: 16 + LVCA: 4 LVCB: 16 LVPA: 2 LVPB: 2 @@ -217,11 +217,11 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 + NumLoadsA: 4 NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularA: 4 NumLoadsPerpendicularB: 16 NumThreads: 32 NumWaveSplitK: 1 @@ -230,7 +230,7 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 + PrefetchGlobalRead: 2 PrefetchLocalRead: 0 PreloadKernArgs: 0 ProblemType: @@ -311,10 +311,10 @@ UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 + ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -369,7 +369,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -380,1833 +380,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAFIkupvkyXPNOy7I-3FF-NlWiRrSRhdTRsYznF_8mqT4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAtkbH-xuD8zpHvBJuf97ihx0uFoWP2EYo0izJhNX4WJc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 8 - LVCA: 16 - LVCB: 4 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 8 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAj86OHcyuh-nqlgm6yAY_J2GzH4bz2W9rUztoV9_MkAE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2304 - LdsInitCVgprs: false - LdsNumBytes: 2304 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA0hdwoQwbVKOxsM8a30UVKSctJj0DHVJikyfbBcQ3bpU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 2 - LSPB: 16 - LVCA: 16 - LVCB: 2 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAv6Qm8D3QgMzNokWPYkh__asDLpAF8tQne5Mfh2u3Mu0= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 2 - LVCB: 2 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 4608 - LdsInitCVgprs: false - LdsNumBytes: 4608 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAZZe5UWKNNiTUva0Vn_gwbYC-HYHLPFuT_vWg-6rEdGY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 16 - LSPA: 16 - LSPB: 16 - LVCA: 2 - LVCB: 2 - LVPA: 2 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 4608 - LdsInitCVgprs: false - LdsNumBytes: 4608 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 4608 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [2, 0.05] - - - [129, 128, 1, 128, 129, 129, 129, 128] - - [1, 0.05] - - - [128, 129, 1, 128, 128, 128, 128, 129] - - [0, 0.05] - - - [128, 128, 1, 129, 128, 128, 128, 128] - - [5, 0.05] - - - [129, 129, 1, 128, 129, 129, 129, 129] - - [4, 0.06] - - - [129, 128, 1, 129, 129, 129, 129, 128] - - [3, 0.05] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [6, 0.05] + - [0, 0.06] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 1c69ca18fd60..9b50fadbda3d 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -78,7 +78,7 @@ UseScaleAB: '' UseScaleAlphaVec: 1 UseScaleCD: false -- - 1LDSBuffer: 0 +- - 1LDSBuffer: 1 ActivationAlt: false ActivationFuncCall: true ActivationFused: true @@ -89,12 +89,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA3UxsVRViHwneNhGe8_9RC-Vi_-62zTPt7hDhN5sx8Dk= + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAfFwDhrLGpf_fJnNGMQ8jmNjRh0cirDpbLofAguxu_uo= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' @@ -111,11 +111,11 @@ EnableF32XdlMathOp: false EnableMatrixInstruction: true ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 + ExpertSchedulingMode: 2 ForceDisableShadowInit: false GlobalReadPerMfma: 1 GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -132,34 +132,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 16 LSCB: 32 LSPA: 2 - LSPB: 4 + LSPB: 8 LVCA: 16 - LVCB: 8 + LVCB: 4 LVPA: 2 LVPB: 1 - LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadA: 256 LdsBlockSizePerPadB: 128 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 + LdsBytesNoAmax: 2432 LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 + LdsNumBytes: 2432 + LdsNumElementsAlignedA: 1152 LdsNumElementsAlignedB: 1280 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 + LdsOffsetMetadata: 2432 + LdsOffsetMetadata_Blk: 5248 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -218,11 +218,11 @@ NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 NumLoadsA: 16 - NumLoadsB: 4 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularB: 2 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -230,8 +230,8 @@ PackedC0IndicesX: [0] PackedC1IdxChars: [J] PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 PreloadKernArgs: 0 ProblemType: Activation: true @@ -311,10 +311,10 @@ UseScaleAlphaVec: 1 UseScaleCD: false ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 + ScheduleIterAlg: 3 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -337,12 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 1 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: false + UnrollMajorLDSB: true UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -380,1833 +380,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserALBrxjIVbe9xsypJ5iqusuD5y0ud2PuBb2rSjn3Fw-SM= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAj_fcRH5vrhIwQdBt_3sdYhX6ET7X1VPQJPgk24nhRqI= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 4 - LVCB: 8 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2560 - LdsInitCVgprs: false - LdsNumBytes: 2560 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA2J76fzRZkTaEu-z1WIzqqf2DeSnWu1jdlOp-H7MRWoc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: false - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 64 - LSPA: 16 - LSPB: 2 - LVCA: 2 - LVCB: 16 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13056 - LdsInitCVgprs: false - LdsNumBytes: 13056 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAulbyz5s8NOqfSnEb6wkSAxlObkqj6m-qfyp7kM_diV4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 4 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAFtT7EEQ04cTZjRVT3zVfMwAR8dcpsl_M5L8XKOevfhk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 32 - LSPA: 2 - LSPB: 1 - LVCA: 16 - LVCB: 32 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 16 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 16 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAi3k3mvRBMsWBJceNQHEK5UQzpno8XAZ9YzFqSDZB8mE= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 1 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 16 - LSCB: 64 - LSPA: 2 - LSPB: 4 - LVCA: 16 - LVCB: 8 - LVPA: 2 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13056 - LdsInitCVgprs: false - LdsNumBytes: 13056 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: false - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 32 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 32 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [0, 3, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 1 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: true - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 0 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA1_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: false - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [1, 0.05] - - - [129, 128, 1, 128, 129, 129, 129, 128] - [0, 0.05] - - - [128, 129, 1, 128, 128, 128, 128, 128] - - [2, 0.05] - - - [128, 128, 1, 129, 128, 128, 128, 129] - - [3, 0.05] - - - [129, 129, 1, 128, 129, 129, 129, 128] - - [6, 0.05] - - - [129, 128, 1, 129, 129, 129, 129, 129] - - [5, 0.05] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [4, 0.05] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 380216396ecd..dfef89b7a94b 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,7 +89,7 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAYEduLLrkRwW7nvY8FjyDOKR0X2mjHoXXQBVGyvRU7tY= + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoFez-_kQ-PuatlsN5JlmSHh4T79Tn3BVV663YZB3q9U= BufferLoad: true BufferStore: true CUCount: null @@ -114,8 +114,8 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -123,7 +123,7 @@ GlobalWriteVectorWidth: 1 GroupLoadStore: false GuaranteeNoPartialA: true - GuaranteeNoPartialB: true + GuaranteeNoPartialB: false GuaranteeNoPartialMetadata: true ISA: [12, 0, 0] InnerUnroll: 1 @@ -132,34 +132,34 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 32 LSCB: 16 - LSPA: 4 - LSPB: 2 - LVCA: 8 - LVCB: 16 + LSPA: 8 + LSPB: 16 + LVCA: 4 + LVCB: 2 LVPA: 1 LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 + LdsBlockSizePerPadA: 256 + LdsBlockSizePerPadB: 256 LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 + LdsBytesNoAmax: 6400 LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 + LdsNumBytes: 6400 + LdsNumElementsAlignedA: 1152 + LdsNumElementsAlignedB: 1152 LdsNumElementsAlignedMetadata: 0 LdsOffsetA: 0 LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 + LdsOffsetB: 1152 + LdsOffsetB_Blk: 5248 LdsOffsetBias: 0 LdsOffsetBiasGSU: 0 LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 + LdsOffsetMetadata: 1152 + LdsOffsetMetadata_Blk: 5248 LdsPadA: 16 LdsPadB: 16 LdsPadMetadata: 0 @@ -217,12 +217,12 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 + NumLoadsA: 2 + NumLoadsB: 2 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -314,7 +314,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -337,12 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 2 + TransposeLDS: 0 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -369,7 +369,7 @@ _DepthUB: 32 _DepthUMetadata: 32 _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 + _UseSgprForGRO: false _VectorStore: 1 _WorkspaceSizePerElemBias: 0 _WorkspaceSizePerElemC: 4 @@ -380,1833 +380,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAoFez-_kQ-PuatlsN5JlmSHh4T79Tn3BVV663YZB3q9U= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 4 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA9V1wf0B_vGaplRTk3izVhllX70Yr5PZZ3rsFX1Y-HuU= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 8 - LSPB: 16 - LVCA: 4 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA78hixZ9neUtij29lh2vSJarVEjvw_7ZFo2z_RBvK2A4= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 8 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2432 - LdsInitCVgprs: false - LdsNumBytes: 2432 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2432 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR1_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAEQCWJH6i7g1_rcAtIc39DbIkNegtcFcfxStZnu4ZfSY= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 8 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 12800 - LdsInitCVgprs: false - LdsNumBytes: 12800 - LdsNumElementsAlignedA: 2304 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2304 - LdsOffsetB_Blk: 10496 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2304 - LdsOffsetMetadata_Blk: 10496 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAQSVUBgP6vRZTU6NN5trNFYaDfuAm3uE7tIdCRN4zTls= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: false - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 16 - LSPA: 4 - LSPB: 16 - LVCA: 8 - LVCB: 2 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13056 - LdsInitCVgprs: false - LdsNumBytes: 13056 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: false - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserADhaRDIyWP7M-cYFjykXduhHTiSyIkrHua64yTGTLGc8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 16 - LSPA: 2 - LSPB: 2 - LVCA: 16 - LVCB: 16 - LVPA: 1 - LVPB: 2 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13056 - LdsInitCVgprs: false - LdsNumBytes: 13056 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2304 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: false - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 8 - NumLoadsB: 32 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 8 - NumLoadsPerpendicularB: 32 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [1, 3, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 1 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: true - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 1 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: false - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 0 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [3, 0.05] - - - [129, 128, 1, 128, 129, 129, 128, 128] - - [6, 0.05] - - - [128, 129, 1, 128, 128, 128, 128, 129] - - [0, 0.05] - - - [128, 128, 1, 129, 128, 128, 129, 128] - - [1, 0.05] - - - [129, 129, 1, 128, 129, 129, 128, 129] - - [4, 0.05] - - - [129, 128, 1, 129, 129, 129, 129, 128] - - [2, 0.05] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [5, 0.05] + - [0, 0.06] - null - null - DeviceEfficiency diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml index 8a14a09357f7..7ad26d4f7334 100644 --- a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1200/Equality/gfx1200_Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs.yaml @@ -89,12 +89,12 @@ AssertSummationElementMultiple: 1 AssignedDerivedParameters: true AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAEFxT2akH41gl7rr3zSi_67_N9aj_4Ho8Aswbd7LIBYE= + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserASXY4cG5lfvIhZiZwG5Ph-9MbindmMWVof0XwyUcbwv4= BufferLoad: true BufferStore: true CUCount: null CUOccupancy: -1 - ClusterLocalRead: 0 + ClusterLocalRead: 1 CodeObjectVersion: '4' ConvertAfterDS: false CustomKernelName: '' @@ -114,8 +114,8 @@ ExpertSchedulingMode: 0 ForceDisableShadowInit: false GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 8 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 4 GlobalSplitU: 1 GlobalSplitUAlgorithm: MultipleBuffer GlobalSplitUCoalesced: false @@ -132,14 +132,14 @@ SupportUserGSU: true, UseUniversalArgs: true} Kernel: true KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 LDSTrInst: false LSCA: 32 LSCB: 32 - LSPA: 4 - LSPB: 8 - LVCA: 8 - LVCB: 4 + LSPA: 8 + LSPB: 4 + LVCA: 4 + LVCB: 8 LVPA: 1 LVPB: 1 LdsBlockSizePerPadA: 128 @@ -217,12 +217,12 @@ NumElementsPerBatchStore: 0 NumElementsPerThread: 8 NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 2 + NumLoadsA: 2 + NumLoadsB: 4 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 2 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 NumThreads: 32 NumWaveSplitK: 1 OptNoLoadLoop: 1 @@ -314,7 +314,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 SourceSwap: 0 StaggerU: 32 StaggerUMapping: 0 @@ -337,12 +337,12 @@ ThreadTile1: 1 ThreadTileA: 8 ThreadTileB: 1 - TransposeLDS: 1 + TransposeLDS: 2 TransposeLDSMetadata: true ULSGRODoubleG2L: 0 UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true + UnrollMajorLDSA: 1 + UnrollMajorLDSB: 1 UnrollMajorLDSMetadata: true Use64bShadowLimit: 1 UseDotInstruction: false @@ -380,1833 +380,9 @@ reorderGRInstForDTVB: false tailLoopOptA: false tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAbUKAPKpEERts3iuNwJCTMwbDh9PLx70dRrVOarocaG8= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 4 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA_cU9g7p-BLxgjjO2NEiLz4eWQWXogS-KJyfVigajQCc= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 1 - LVCA: 8 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 256 - LdsBlockSizePerPadB: 256 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6400 - LdsInitCVgprs: false - LdsNumBytes: 6400 - LdsNumElementsAlignedA: 1152 - LdsNumElementsAlignedB: 1152 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1152 - LdsOffsetB_Blk: 5248 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1152 - LdsOffsetMetadata_Blk: 5248 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA256_LBSPPB256_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS0_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 0 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 0 - UnrollMajorLDSB: 0 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA-tQUyMYVl9AYSwQzDimPbuzM4MsX5ocNjbdvR3Q9kIo= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 8 - LSPB: 4 - LVCA: 4 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 2560 - LdsInitCVgprs: false - LdsNumBytes: 2560 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 0 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 8 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 2 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: 1 - UnrollMajorLDSB: 1 - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserA8lrRX_Ke3UuEGIY_CZIvxXTEwqKIJFVOV_aVcyGIC48= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 0 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 32 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 4 - GlobalReadVectorWidthB: 1 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 32 - LSCB: 32 - LSPA: 4 - LSPB: 1 - LVCA: 8 - LVCB: 32 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 6656 - LdsInitCVgprs: false - LdsNumBytes: 6656 - LdsNumElementsAlignedA: 1280 - LdsNumElementsAlignedB: 1280 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 4096 - LdsOffsetB: 1280 - LdsOffsetB_Blk: 5376 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 1280 - LdsOffsetMetadata_Blk: 5376 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 2 - LoopUnroll: 32 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 32 - _DepthUA: 32 - _DepthUB: 32 - _DepthUMetadata: 32 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 2 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 0 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAix5OfNB4tktC22obSHl7a_Ne-mnN05cUd1_kdWLRBAA= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 0 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 4 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 2 - LVCA: 8 - LVCB: 16 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 13312 - LdsInitCVgprs: false - LdsNumBytes: 13312 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 2560 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 8 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 8 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 2 - PrefetchLocalRead: 1 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB4_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SIA1_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - - 1LDSBuffer: 1 - ActivationAlt: false - ActivationFuncCall: true - ActivationFused: true - AssertAIGreaterThanEqual: -1 - AssertAILessThanEqual: -1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: true - AssignedProblemIndependentDerivedParameters: true - BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAZlEgPLQuvcw9KFARFIEUcCi_mwVtU1IwhuZ3KNTPSLk= - BufferLoad: true - BufferStore: true - CUCount: null - CUOccupancy: -1 - ClusterLocalRead: 1 - CodeObjectVersion: '4' - ConvertAfterDS: false - CustomKernelName: '' - DebugStreamK: 0 - DepthU: 64 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DirectToVgprA: 0 - DirectToVgprB: 0 - DirectToVgprSparseMetadata: false - EdgeType: ShiftPtr - EnableF32XEmulationLds: false - EnableF32XdlMathOp: false - EnableMatrixInstruction: true - ExpandPointerSwap: 0 - ExpertSchedulingMode: 2 - ForceDisableShadowInit: false - GlobalReadPerMfma: 1 - GlobalReadVectorWidthA: 8 - GlobalReadVectorWidthB: 8 - GlobalSplitU: 1 - GlobalSplitUAlgorithm: MultipleBuffer - GlobalSplitUCoalesced: false - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GroupLoadStore: false - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - GuaranteeNoPartialMetadata: true - ISA: [12, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, - SupportUserGSU: true, UseUniversalArgs: true} - Kernel: true - KernelLanguage: Assembly - KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1 - LDSTrInst: false - LSCA: 64 - LSCB: 64 - LSPA: 4 - LSPB: 4 - LVCA: 8 - LVCB: 8 - LVPA: 1 - LVPB: 1 - LdsBlockSizePerPadA: 128 - LdsBlockSizePerPadB: 128 - LdsBlockSizePerPadMetadata: 0 - LdsBytesNoAmax: 5120 - LdsInitCVgprs: false - LdsNumBytes: 5120 - LdsNumElementsAlignedA: 2560 - LdsNumElementsAlignedB: 2560 - LdsNumElementsAlignedMetadata: 0 - LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 - LdsOffsetB: 2560 - LdsOffsetB_Blk: 10752 - LdsOffsetBias: 0 - LdsOffsetBiasGSU: 0 - LdsOffsetBiasNonGSU: 0 - LdsOffsetMetadata: 5120 - LdsOffsetMetadata_Blk: 10752 - LdsPadA: 16 - LdsPadB: 16 - LdsPadMetadata: 0 - LocalReadVectorWidth: 8 - LocalSplitU: 1 - LocalSplitUReuseLDS: 1 - LocalWritePerMfma: -1 - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopIters: 4 - LoopUnroll: 64 - MFMA_BF16_1K: 0 - MIArchVgpr: true - MIBlock: [16, 16, 16, 1, 1, 1] - MIInputPerThread: 8 - MIInputPerThreadA: 8 - MIInputPerThreadB: 8 - MIInputPerThreadMetadata: 8 - MIOutputVectorWidth: 8 - MIRegPerOut: 1 - MIWaveGroup: [1, 1] - MIWaveTile: [1, 1] - MIWaveTileA: 1 - MIWaveTileB: 1 - MIWaveTileMetadata: 0 - MacroTile0: 16 - MacroTile1: 16 - MacroTileA: 16 - MacroTileB: 16 - MagicDivAlg: 2 - MathClocksUnrolledLoop: 0 - MatrixInstB: 1 - MatrixInstBM: 1 - MatrixInstBN: 1 - MatrixInstK: 16 - MatrixInstM: 16 - MatrixInstN: 16 - MatrixInstruction: [16, 16, 16, 1] - MaxLDS: 65536 - MaxOccupancy: 40 - MbskPrefetchOpt: 0 - NoLdsWriteCode: false - NoReject: false - NoTailLoop: false - NonDTLTailLoopA: true - NonDTLTailLoopB: true - NonTemporal: -1 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NonTemporalD: 0 - NonTemporalE: 0 - NonTemporalMetadata: 0 - NonTemporalWS: 0 - NumElementsPerBatchStore: 0 - NumElementsPerThread: 8 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 4 - NumLoadsB: 4 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 4 - NumThreads: 32 - NumWaveSplitK: 1 - OptNoLoadLoop: 1 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PrefetchGlobalRead: 1 - PrefetchLocalRead: 0 - PreloadKernArgs: 0 - ProblemType: - Activation: true - ActivationComputeDataType: 0 - ActivationNoGuard: false - ActivationType: hipblaslt_all - AllowNoFreeDims: false - AssignedDerivedParameters: true - Batched: true - BetaOnlyUseBias: false - BiasDataTypeList: [0] - BiasSrc: D - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 4 - DataTypeA: 4 - DataTypeAmaxD: 0 - DataTypeB: 4 - DataTypeE: 4 - DestDataType: 4 - F32XdlMathOp: 0 - Gradient: false - GroupedGemm: false - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexAssignmentsMetadata: [3, 0, 2] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndexUnrollM: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - MirrorDimsA: [] - MirrorDimsB: [] - MirrorDimsMetadata: [] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - OutputAmaxD: false - SetConstStrideA: [] - SetConstStrideB: [] - SetConstStrideBias: [] - SilentHighPrecisionAccumulate: false - Sparse: 0 - StochasticRounding: false - StridedBatched: true - SupportUserArgs: true - SwizzleTensorA: false - SwizzleTensorB: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: 1 - TransposeB: 0 - UseBeta: true - UseBias: 1 - UseE: true - UseInitialStridesAB: false - UseInitialStridesCD: false - UseScaleAB: '' - UseScaleAlphaVec: 1 - UseScaleCD: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 3 - ScheduleLocalWrite: 1 - SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x64_MI16x16x1_SN_LDSB1_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR1_PLR0_PKA0_SIA3_SS1_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW1_SK0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1 - SourceSwap: 1 - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - StorePriorityOpt: false - StoreRemapVectorWidth: 0 - StoreSwapAddr: false - StoreSyncOpt: 0 - StoreVectorWidth: 1 - StreamK: 0 - StreamKAtomic: 0 - StreamKXCCMapping: 0 - SubGroup0: 2 - SubGroup1: 16 - SubGroupA: 2 - SubGroupB: 16 - SuppressNoLoadLoop: false - ThreadTile: [1, 1] - ThreadTile0: 8 - ThreadTile1: 1 - ThreadTileA: 8 - ThreadTileB: 1 - TransposeLDS: 1 - TransposeLDSMetadata: true - ULSGRODoubleG2L: 0 - UnrollLoopSwapGlobalReadOrder: 0 - UnrollMajorLDSA: true - UnrollMajorLDSB: true - UnrollMajorLDSMetadata: true - Use64bShadowLimit: 1 - UseDotInstruction: false - UseF32XEmulation: false - UseInstOffsetForGRO: 0 - UseSgprForGRO: -1 - Valid: true - VectorStore: -1 - VectorWidthA: 1 - VectorWidthB: 1 - WaveSeparateGlobalReadA: 0 - WaveSeparateGlobalReadB: 0 - WaveSeparateGlobalReadMetadata: 0 - WaveSplitK: false - WavefrontSize: 32 - WorkGroup: [16, 2, 1] - WorkGroupMapping: 8 - WorkGroupMappingXCC: 1 - WorkGroupMappingXCCGroup: -1 - WorkGroupReduction: false - WorkspaceCheck: [4, 0, 1] - _DepthU: 64 - _DepthUA: 64 - _DepthUB: 64 - _DepthUMetadata: 64 - _GlobalAccumulation: MultipleBuffer - _UseSgprForGRO: 1 - _VectorStore: 1 - _WorkspaceSizePerElemBias: 0 - _WorkspaceSizePerElemC: 4 - _staggerStrideShift: 1 - enableLDSTrA: false - enableLDSTrB: false - reorderGRInstForDTVA: false - reorderGRInstForDTVB: false - tailLoopOptA: false - tailLoopOptB: false - [2, 3, 0, 1] - - - [128, 128, 1, 128, 128, 128, 128, 128] - - [3, 0.06] - - - [129, 128, 1, 128, 129, 129, 128, 128] - - [0, 0.05] - - - [128, 129, 1, 128, 128, 128, 128, 128] - - [5, 0.05] - - - [128, 128, 1, 129, 128, 128, 129, 129] - - [2, 0.05] - - - [129, 129, 1, 128, 129, 129, 128, 128] - - [1, 0.05] - - - [129, 128, 1, 129, 129, 129, 129, 129] - - [6, 0.05] - - - [129, 129, 1, 129, 129, 129, 129, 129] - - [4, 0.05] + - [0, 0.06] - null - null - DeviceEfficiency